oracle-watchdog

metrics

import "github.com/afreidah/oracle-watchdog/internal/metrics"

Index

Variables

// Monitor-mode metrics describing the Consul connection and session.
// Note that the exported series names carry no "monitor" segment, unlike the
// agent metrics, which are prefixed oracle_watchdog_agent_.
var (
    // MonitorConsulConnected is a gauge reporting whether the monitor is
    // connected to Consul: 1 when connected, 0 when disconnected.
    MonitorConsulConnected = prometheus.NewGauge(prometheus.GaugeOpts{
        Name: "oracle_watchdog_consul_connected",
        Help: "Whether the monitor is connected to Consul (1=connected, 0=disconnected)",
    })

    // MonitorSessionActive is a gauge reporting whether the Consul session is
    // active: 1 when active, 0 when inactive.
    MonitorSessionActive = prometheus.NewGauge(prometheus.GaugeOpts{
        Name: "oracle_watchdog_session_active",
        Help: "Whether the Consul session is active (1=active, 0=inactive)",
    })

    // MonitorReconnectAttempts counts Consul reconnection attempts.
    MonitorReconnectAttempts = prometheus.NewCounter(prometheus.CounterOpts{
        Name: "oracle_watchdog_reconnect_attempts_total",
        Help: "Total number of Consul reconnection attempts",
    })

    // MonitorSessionRenewals counts successful session renewals only; failed
    // renewals are tracked by MonitorSessionFailures.
    MonitorSessionRenewals = prometheus.NewCounter(prometheus.CounterOpts{
        Name: "oracle_watchdog_session_renewals_total",
        Help: "Total number of successful session renewals",
    })

    // MonitorSessionFailures counts session failures, covering both session
    // creation and session renewal.
    MonitorSessionFailures = prometheus.NewCounter(prometheus.CounterOpts{
        Name: "oracle_watchdog_session_failures_total",
        Help: "Total number of session failures (creation or renewal)",
    })
)

// WireGuard endpoint resolver metrics. The RegisterMonitor documentation
// lists these among the monitor-mode metrics.
var (
    // WgEndpointResolutionFailures counts every tick that failed before the
    // endpoint update could be attempted (DNS lookup error, peer not found,
    // netlink error). Operators should alert on a sustained non-zero rate.
    WgEndpointResolutionFailures = prometheus.NewCounter(prometheus.CounterOpts{
        Name: "oracle_watchdog_wg_endpoint_resolution_failures_total",
        Help: "Total resolver ticks that failed before applying an endpoint update",
    })

    // WgEndpointChanges counts successful peer endpoint updates. A high rate
    // indicates the WAN IP is flapping or the stale-handshake threshold is
    // too aggressive.
    WgEndpointChanges = prometheus.NewCounter(prometheus.CounterOpts{
        Name: "oracle_watchdog_wg_endpoint_changes_total",
        Help: "Total successful peer endpoint updates applied via netlink",
    })

    // WgEndpointLastUpdate records the wall-clock time of the most recent
    // successful endpoint update as a Unix timestamp in seconds. Useful for
    // "time since last change" dashboard panels.
    WgEndpointLastUpdate = prometheus.NewGauge(prometheus.GaugeOpts{
        Name: "oracle_watchdog_wg_endpoint_last_update_timestamp_seconds",
        Help: "Unix timestamp of the most recent successful endpoint update",
    })

    // WgEndpointCurrentIP exposes the currently configured peer endpoint IP
    // as a label so dashboards can display it. The label is rotated (old
    // value deleted) on every change to avoid accumulating dead time series.
    // Labels: interface, peer, ip. The sample value is always 1.
    WgEndpointCurrentIP = prometheus.NewGaugeVec(prometheus.GaugeOpts{
        Name: "oracle_watchdog_wg_endpoint_current_ip",
        Help: "Always 1; the current peer endpoint IP is encoded in the ip label",
    }, []string{"interface", "peer", "ip"})

    // WgPeerHandshakeAge reports the seconds elapsed since the most recent
    // successful handshake with the tracked peer. A value of -1 indicates no
    // handshake has ever completed. Labels: peer (no interface label, unlike
    // WgEndpointCurrentIP).
    WgPeerHandshakeAge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
        Name: "oracle_watchdog_wg_peer_handshake_age_seconds",
        Help: "Seconds since the most recent peer handshake; -1 if never",
    }, []string{"peer"})
)

// Agent-mode metrics: connectivity to Consul and OCI, node counts, and
// per-node restart outcomes. All series are prefixed oracle_watchdog_agent_.
var (
    // AgentConsulConnected is a gauge reporting whether the agent is
    // connected to Consul: 1 when connected, 0 when disconnected.
    AgentConsulConnected = prometheus.NewGauge(prometheus.GaugeOpts{
        Name: "oracle_watchdog_agent_consul_connected",
        Help: "Whether the agent is connected to Consul (1=connected, 0=disconnected)",
    })

    // AgentOCIConnected is a gauge reporting whether the agent is connected
    // to OCI: 1 when connected, 0 when disconnected.
    AgentOCIConnected = prometheus.NewGauge(prometheus.GaugeOpts{
        Name: "oracle_watchdog_agent_oci_connected",
        Help: "Whether the agent is connected to OCI (1=connected, 0=disconnected)",
    })

    // AgentNodesMonitored is a gauge holding the number of nodes being
    // monitored.
    AgentNodesMonitored = prometheus.NewGauge(prometheus.GaugeOpts{
        Name: "oracle_watchdog_agent_nodes_monitored",
        Help: "Number of nodes being monitored",
    })

    // AgentNodesMissing is a gauge holding the number of nodes currently
    // missing.
    AgentNodesMissing = prometheus.NewGauge(prometheus.GaugeOpts{
        Name: "oracle_watchdog_agent_nodes_missing",
        Help: "Number of nodes currently missing",
    })

    // AgentRestartAttempts counts restart attempts, partitioned by the node
    // label.
    AgentRestartAttempts = prometheus.NewCounterVec(prometheus.CounterOpts{
        Name: "oracle_watchdog_agent_restart_attempts_total",
        Help: "Total restart attempts per node",
    }, []string{"node"})

    // AgentRestartSuccesses counts successful restarts, partitioned by the
    // node label.
    AgentRestartSuccesses = prometheus.NewCounterVec(prometheus.CounterOpts{
        Name: "oracle_watchdog_agent_restart_successes_total",
        Help: "Total successful restarts per node",
    }, []string{"node"})

    // AgentRestartFailures counts failed restarts, partitioned by the node
    // label.
    AgentRestartFailures = prometheus.NewCounterVec(prometheus.CounterOpts{
        Name: "oracle_watchdog_agent_restart_failures_total",
        Help: "Total failed restarts per node",
    }, []string{"node"})

    // AgentConsulCheckFailures counts Consul check failures. It is a plain
    // counter with no labels, so failures are not broken down per node.
    AgentConsulCheckFailures = prometheus.NewCounter(prometheus.CounterOpts{
        Name: "oracle_watchdog_agent_consul_check_failures_total",
        Help: "Total Consul check failures",
    })
)

// WAN IP detection and DNS updater metrics. The RegisterAgent documentation
// lists the WAN DNS updater metrics among the agent-mode metrics.
var (
    // WanIPCurrent exposes the most recently detected WAN IPv4 address as a
    // label. Always 1; the value lives in the ip label and is rotated on
    // change so only the active series remains. Labels: ip.
    WanIPCurrent = prometheus.NewGaugeVec(prometheus.GaugeOpts{
        Name: "oracle_watchdog_wan_ip_current",
        Help: "Always 1; the current detected WAN IPv4 is encoded in the ip label",
    }, []string{"ip"})

    // WanIPChanges counts how many times the detected WAN IP differed from
    // the previous reading. Drives the "WAN IP flap" alert.
    WanIPChanges = prometheus.NewCounter(prometheus.CounterOpts{
        Name: "oracle_watchdog_wan_ip_changes_total",
        Help: "Total times the detected WAN IP changed between polls",
    })

    // CloudflareRecordUpdates counts attempted DNS record updates split by
    // outcome ("success" or "fail"), carried in the result label.
    CloudflareRecordUpdates = prometheus.NewCounterVec(prometheus.CounterOpts{
        Name: "oracle_watchdog_cloudflare_record_updates_total",
        Help: "Total Cloudflare DNS record update attempts",
    }, []string{"result"})

    // WanIPDetectionFailures counts WAN-IP detection failures per provider.
    // A provider that consistently fails should be replaced. Per the help
    // text, the provider label holds the provider URL.
    WanIPDetectionFailures = prometheus.NewCounterVec(prometheus.CounterOpts{
        Name: "oracle_watchdog_wan_ip_detection_failures_total",
        Help: "Total WAN-IP detection failures per provider URL",
    }, []string{"provider"})

    // WanDNSLastCheck records the wall-clock time of the most recent
    // detection attempt regardless of outcome, as a Unix timestamp in
    // seconds.
    WanDNSLastCheck = prometheus.NewGauge(prometheus.GaugeOpts{
        Name: "oracle_watchdog_wan_dns_last_check_timestamp_seconds",
        Help: "Unix timestamp of the most recent WAN-IP detection attempt",
    })

    // WanDNSInCooldown reports whether the updater is currently within the
    // post-update cooldown window (1 = in cooldown, 0 = updates allowed).
    WanDNSInCooldown = prometheus.NewGauge(prometheus.GaugeOpts{
        Name: "oracle_watchdog_wan_dns_in_cooldown",
        Help: "Whether the WAN DNS updater is in the post-update cooldown window",
    })
)

func RegisterAgent

func RegisterAgent()

RegisterAgent registers all agent-mode metrics, including the WAN DNS updater metrics. It is safe to call multiple times.

func RegisterMonitor

func RegisterMonitor()

RegisterMonitor registers all monitor-mode metrics, including the WireGuard endpoint resolver metrics. It is safe to call multiple times.

func Serve

func Serve(ctx context.Context, port string)

Serve starts a Prometheus metrics HTTP server on the given port and blocks until ctx is cancelled. The port string should include the leading colon (e.g. `:9102`).

Generated by gomarkdoc