From 56b5e4a2058d45c86d2ce5459f0040f215aa6e06 Mon Sep 17 00:00:00 2001 From: Anyitechs Date: Tue, 27 Jan 2026 01:44:01 +0100 Subject: [PATCH] Introduce telemetry for observability This introduces the foundational telemetry infrastructure to improve the observability of LDK Server. It adds a new `/metrics` endpoint exposed on the REST service address, which serves Prometheus-compatible metrics. This endpoint is public and does not require HMAC authentication, allowing for easy integration with monitoring systems. - Added `prometheus` dependency and a `Metrics` utility struct. - Introduced a basic `ldk_health_score` gauge (0-100) that reflects the node's operational status based on connection to peer, sync state, and running status. This is the first step in a larger effort to provide comprehensive telemetry. Future updates will expand this to include metrics for channels, balances, payments, and other critical node activities. --- Cargo.lock | 69 +++++++++++++-- ldk-server-protos/src/endpoints.rs | 1 + ldk-server/Cargo.toml | 2 + ldk-server/src/api/error.rs | 12 +++ ldk-server/src/main.rs | 11 +++ ldk-server/src/service.rs | 26 +++++- ldk-server/src/util/metrics.rs | 138 +++++++++++++++++++++++++++++ ldk-server/src/util/mod.rs | 1 + 8 files changed, 251 insertions(+), 9 deletions(-) create mode 100644 ldk-server/src/util/metrics.rs diff --git a/Cargo.lock b/Cargo.lock index 538c9774..06d5d8b0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -121,7 +121,7 @@ dependencies = [ "nom", "num-traits", "rusticata-macros", - "thiserror", + "thiserror 2.0.17", "time", ] @@ -1751,9 +1751,11 @@ dependencies = [ "hyper 1.7.0", "hyper-util", "lapin", + "lazy_static", "ldk-node", "ldk-server-protos", "log", + "prometheus", "prost", "rand 0.8.5", "rcgen", @@ -2135,7 +2137,7 @@ dependencies = [ "rc2", "sha1", "sha2", - "thiserror", + "thiserror 2.0.17", "x509-parser", ] @@ -2361,6 +2363,21 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "prometheus" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ca5326d8d0b950a9acd87e6a3f94745394f62e4dae1b1ee22b2bc0c394af43a" +dependencies = [ + "cfg-if", + "fnv", + "lazy_static", + "memchr", + "parking_lot", + "protobuf", + "thiserror 2.0.17", +] + [[package]] name = "prost" version = "0.11.9" @@ -2415,6 +2432,26 @@ dependencies = [ "prost", ] +[[package]] +name = "protobuf" +version = "3.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d65a1d4ddae7d8b5de68153b48f6aa3bba8cb002b243dbdbc55a5afbc98f99f4" +dependencies = [ + "once_cell", + "protobuf-support", + "thiserror 1.0.69", +] + +[[package]] +name = "protobuf-support" +version = "3.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e36c2f31e0a47f9280fb347ef5e461ffcd2c52dd520d8e216b52f93b0b0d7d6" +dependencies = [ + "thiserror 1.0.69", +] + [[package]] name = "quinn" version = "0.11.9" @@ -2429,7 +2466,7 @@ dependencies = [ "rustc-hash", "rustls 0.23.34", "socket2 0.6.1", - "thiserror", + "thiserror 2.0.17", "tokio", "tracing", "web-time", @@ -2450,7 +2487,7 @@ dependencies = [ "rustls 0.23.34", "rustls-pki-types", "slab", - "thiserror", + "thiserror 2.0.17", "tinyvec", "tracing", "web-time", @@ -3255,13 +3292,33 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", +] + [[package]] name = "thiserror" version = "2.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" dependencies = [ - "thiserror-impl", + "thiserror-impl 2.0.17", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.108", ] [[package]] @@ -4075,7 +4132,7 @@ dependencies = [ "nom", "oid-registry", "rusticata-macros", - "thiserror", + "thiserror 2.0.17", "time", ] diff --git a/ldk-server-protos/src/endpoints.rs b/ldk-server-protos/src/endpoints.rs index 606da0db..811456d8 100644 --- a/ldk-server-protos/src/endpoints.rs +++ b/ldk-server-protos/src/endpoints.rs @@ -26,3 +26,4 @@ pub const LIST_FORWARDED_PAYMENTS_PATH: &str = "ListForwardedPayments"; pub const UPDATE_CHANNEL_CONFIG_PATH: &str = "UpdateChannelConfig"; pub const GET_PAYMENT_DETAILS_PATH: &str = "GetPaymentDetails"; pub const CONNECT_PEER_PATH: &str = "ConnectPeer"; +pub const GET_METRICS_PATH: &str = "metrics"; diff --git a/ldk-server/Cargo.toml b/ldk-server/Cargo.toml index 965c9604..6b0182cc 100644 --- a/ldk-server/Cargo.toml +++ b/ldk-server/Cargo.toml @@ -23,6 +23,8 @@ toml = { version = "0.8.9", default-features = false, features = ["parse"] } chrono = { version = "0.4", default-features = false, features = ["clock"] } log = "0.4.28" base64 = { version = "0.21", default-features = false, features = ["std"] } +lazy_static = "1.5.0" +prometheus = "0.14.0" # Required for RabittMQ based EventPublisher. Only enabled for `events-rabbitmq` feature. lapin = { version = "2.4.0", features = ["rustls"], default-features = false, optional = true } diff --git a/ldk-server/src/api/error.rs b/ldk-server/src/api/error.rs index 9117c025..a88cac8d 100644 --- a/ldk-server/src/api/error.rs +++ b/ldk-server/src/api/error.rs @@ -131,3 +131,15 @@ impl From for LdkServerError { LdkServerError::new(error_code, message) } } + +impl From for LdkServerError { + fn from(e: prometheus::Error) -> Self { + LdkServerError::new(LdkServerErrorCode::InternalServerError, e.to_string()) + } +} + +impl From for LdkServerError { + fn from(e: std::string::FromUtf8Error) -> Self { + LdkServerError::new(LdkServerErrorCode::InternalServerError, e.to_string()) + } +} diff --git a/ldk-server/src/main.rs b/ldk-server/src/main.rs index ba0731c9..7db42174 100644 --- a/ldk-server/src/main.rs +++ b/ldk-server/src/main.rs @@ -50,6 +50,7 @@ use crate::io::persist::{ use crate::service::NodeService; use crate::util::config::{load_config, ChainSource}; use crate::util::logger::ServerLogger; +use crate::util::metrics::{BUILD_METRICS_INTERVAL, METRICS}; use crate::util::proto_adapter::{forwarded_payment_to_proto, payment_to_proto}; use crate::util::tls::get_or_generate_tls_config; @@ -291,6 +292,16 @@ fn main() { } }; let event_node = Arc::clone(&node); + + let metrics_node = Arc::clone(&node); + let mut interval = tokio::time::interval(BUILD_METRICS_INTERVAL); + runtime.spawn(async move { + loop { + interval.tick().await; + METRICS.update_service_health_score(&metrics_node); + } + }); + let rest_svc_listener = TcpListener::bind(config_file.rest_service_addr) .await .expect("Failed to bind listening port"); diff --git a/ldk-server/src/service.rs b/ldk-server/src/service.rs index 0f050945..c9e7e951 100644 --- a/ldk-server/src/service.rs +++ b/ldk-server/src/service.rs @@ -21,9 +21,9 @@ use ldk_node::Node; use ldk_server_protos::endpoints::{ BOLT11_RECEIVE_PATH, BOLT11_SEND_PATH, BOLT12_RECEIVE_PATH, BOLT12_SEND_PATH, CLOSE_CHANNEL_PATH, CONNECT_PEER_PATH, FORCE_CLOSE_CHANNEL_PATH, GET_BALANCES_PATH, - GET_NODE_INFO_PATH, GET_PAYMENT_DETAILS_PATH, LIST_CHANNELS_PATH, LIST_FORWARDED_PAYMENTS_PATH, - LIST_PAYMENTS_PATH, ONCHAIN_RECEIVE_PATH, ONCHAIN_SEND_PATH, OPEN_CHANNEL_PATH, SPLICE_IN_PATH, - SPLICE_OUT_PATH, UPDATE_CHANNEL_CONFIG_PATH, + GET_METRICS_PATH, GET_NODE_INFO_PATH, GET_PAYMENT_DETAILS_PATH, LIST_CHANNELS_PATH, + LIST_FORWARDED_PAYMENTS_PATH, LIST_PAYMENTS_PATH, ONCHAIN_RECEIVE_PATH, ONCHAIN_SEND_PATH, + OPEN_CHANNEL_PATH, SPLICE_IN_PATH, SPLICE_OUT_PATH, UPDATE_CHANNEL_CONFIG_PATH, }; use prost::Message; @@ -47,6 +47,7 @@ use crate::api::open_channel::handle_open_channel; use crate::api::splice_channel::{handle_splice_in_request, handle_splice_out_request}; use crate::api::update_channel_config::handle_update_channel_config_request; use crate::io::persist::paginated_kv_store::PaginatedKVStore; +use crate::util::metrics::METRICS; use crate::util::proto_adapter::to_error_response; // Maximum request body size: 10 MB @@ -148,6 +149,25 @@ impl Service> for NodeService { type Future = Pin> + Send>>; fn call(&self, req: Request) -> Self::Future { + // Handle metrics endpoint separately to bypass auth and return plain text + if req.uri().path().len() > 1 && &req.uri().path()[1..] == GET_METRICS_PATH { + return Box::pin(async move { + match METRICS.gather_metrics() { + Ok(metrics) => Ok(Response::builder() + .header("Content-Type", "text/plain") + .body(Full::new(Bytes::from(metrics))) + .unwrap()), + Err(e) => { + let (error_response, status_code) = to_error_response(e); + Ok(Response::builder() + .status(status_code) + .body(Full::new(Bytes::from(error_response.encode_to_vec()))) + .unwrap()) + }, + } + }); + } + // Extract auth params from headers (validation happens after body is read) let auth_params = match extract_auth_params(&req) { Ok(params) => params, diff --git a/ldk-server/src/util/metrics.rs b/ldk-server/src/util/metrics.rs new file mode 100644 index 00000000..290138d3 --- /dev/null +++ b/ldk-server/src/util/metrics.rs @@ -0,0 +1,138 @@ +// This file is Copyright its original authors, visible in version control +// history. +// +// This file is licensed under the Apache License, Version 2.0 or the MIT license +// , at your option. +// You may not use this file except in accordance with one or both of these +// licenses. + +use std::time::Duration; + +use lazy_static::lazy_static; +use ldk_node::Node; +use prometheus::{ + default_registry, gather, register_int_gauge_with_registry, Encoder, IntGauge, Opts, Registry, + TextEncoder, +}; + +use crate::api::error::LdkServerError; + +pub const BUILD_METRICS_INTERVAL: Duration = Duration::from_secs(60); + +lazy_static! { + pub static ref METRICS: Metrics = Metrics::new(default_registry()); +} + +pub struct Metrics { + pub service_health_score: IntGauge, +} + +impl Metrics { + pub fn new(registry: &Registry) -> Self { + Self { + service_health_score: register_int_gauge_with_registry!( + Opts::new("ldk_health_score", "Current health score (0-100)"), + registry + ) + .expect("Failed to register metric"), + } + } + + pub fn update_service_health_score(&self, node: &Node) { + let score = self.calculate_ldk_server_health_score(node); + self.service_health_score.set(score); + } + + /// The health score computation is pretty basic for now and simply + /// calculated based on the impacted events on the components of the + /// `Node`. The events severity and weightage value are as follows: + /// + /// - Critical: 0 (Total failure) + /// - Major: 35% + /// - Minor: 25% + /// + /// Using the assigned score above, the health score of the `Node` is + /// computed as: + /// + /// Health score = Maximum health score - Sum(Event severity score) + /// + /// Where: + /// + /// - Maximum health score = 100 + /// + /// If the `Node` is not running/online, i.e `is_running` is false, + /// the severity is critical with a weightage value of -100%. + /// + /// If the `Node` is running but isn't connected to any peer yet, + /// the severity is major with a weightage value of -35%. + /// + /// If the `Node` is running but the Lightning Wallet hasn't been synced + /// yet, the severity is minor with a weightage value of -25%. + pub fn calculate_ldk_server_health_score(&self, node: &Node) -> i64 { + Self::compute_health_score( + node.status().is_running, + !node.list_peers().is_empty(), + node.status().latest_lightning_wallet_sync_timestamp.is_some(), + ) + } + + pub fn gather_metrics(&self) -> Result { + let mut buffer = Vec::new(); + let encoder = TextEncoder::new(); + + let all_metrics = gather(); + encoder.encode(&all_metrics, &mut buffer)?; + Ok(String::from_utf8(buffer)?) + } + + fn compute_health_score(is_running: bool, has_peers: bool, is_wallet_synced: bool) -> i64 { + if !is_running { + return 0; + } + + let mut health_score = 100; + + if !has_peers { + health_score -= 35; + } + + if !is_wallet_synced { + health_score -= 25; + } + + health_score + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_compute_health_score() { + // Node is not running + assert_eq!(Metrics::compute_health_score(false, true, true), 0); + assert_eq!(Metrics::compute_health_score(false, false, false), 0); + + // Node is running, connected to a peer and wallet is synced + assert_eq!(Metrics::compute_health_score(true, true, true), 100); + + // Node is running, not connected to a peer but wallet is synced + assert_eq!(Metrics::compute_health_score(true, false, true), 65); + + // Node is running, connected to a peer but wallet is not synced + assert_eq!(Metrics::compute_health_score(true, true, false), 75); + + // Node is running, not connected to a peer and wallet is not synced + assert_eq!(Metrics::compute_health_score(true, false, false), 40); + } + + #[test] + fn test_gather_metrics_format() { + let result = METRICS.gather_metrics(); + assert!(result.is_ok()); + let output = result.unwrap(); + assert!(output.contains("ldk_health_score")); + } +} diff --git a/ldk-server/src/util/mod.rs b/ldk-server/src/util/mod.rs index 3662b128..1d22bb9e 100644 --- a/ldk-server/src/util/mod.rs +++ b/ldk-server/src/util/mod.rs @@ -9,5 +9,6 @@ pub(crate) mod config; pub(crate) mod logger; +pub(crate) mod metrics; pub(crate) mod proto_adapter; pub(crate) mod tls;