diff --git a/internal/api/router/router.go b/internal/api/router/router.go
index 46b975b5..7d642582 100644
--- a/internal/api/router/router.go
+++ b/internal/api/router/router.go
@@ -2,7 +2,9 @@ package router
 
 import (
+	"context"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"net/http"
 	"strings"
@@ -19,6 +21,12 @@ import (
 	"github.com/modelcontextprotocol/registry/internal/telemetry"
 )
 
+// statusClientClosed mirrors NGINX's non-standard 499 — used for requests
+// where the client disconnected before we finished. Distinguishing these from
+// real server errors keeps the availability metric meaningful under bursts of
+// scraper traffic that time out and reconnect.
+const statusClientClosed = 499
+
 // Middleware configuration options
 type middlewareConfig struct {
 	skipPaths map[string]bool
@@ -67,6 +75,22 @@ func MetricTelemetryMiddleware(metrics *telemetry.Metrics, options ...Middleware
 		duration := time.Since(start).Seconds()
 		statusCode := ctx.Status()
 
+		// If the client disconnected before the handler finished, the handler
+		// likely converted the resulting context.Canceled into a huma 5xx and
+		// tried to write a response to a closed socket. NGINX records that
+		// case as a 499 (client closed). Without this remap we count it as a
+		// server error: a single ServiceNow-style burst that times out a
+		// few thousand list-servers requests inflates http_errors_total even
+		// though no client ever saw a 5xx, and the availability alert fires
+		// on what is effectively just slow responses.
+		//
+		// Only context.Canceled is remapped — context.DeadlineExceeded would
+		// indicate a server-side timeout we set ourselves and should still
+		// count as a server error if/when we add per-request deadlines.
+		if reqErr := ctx.Context().Err(); reqErr != nil && errors.Is(reqErr, context.Canceled) {
+			statusCode = statusClientClosed
+		}
+
 		// Combine common and custom attributes
 		attrs := []attribute.KeyValue{
 			attribute.String("method", method),
@@ -77,7 +101,9 @@ func MetricTelemetryMiddleware(metrics *telemetry.Metrics, options ...Middleware
 
 		// Record metrics
 		metrics.Requests.Add(ctx.Context(), 1, metric.WithAttributes(attrs...))
-		if statusCode >= 400 {
+		// Skip the error counter for client-closed requests so the availability
+		// metric reflects server-visible errors only.
+		if statusCode >= 400 && statusCode != statusClientClosed {
 			metrics.ErrorCount.Add(ctx.Context(), 1, metric.WithAttributes(attrs...))
 		}
 
diff --git a/internal/api/router/router_test.go b/internal/api/router/router_test.go
new file mode 100644
index 00000000..f61b1ded
--- /dev/null
+++ b/internal/api/router/router_test.go
@@ -0,0 +1,119 @@
+package router_test
+
+import (
+	"context"
+	"errors"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+
+	"github.com/danielgtaylor/huma/v2"
+	"github.com/danielgtaylor/huma/v2/adapters/humago"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	"github.com/modelcontextprotocol/registry/internal/api/router"
+	"github.com/modelcontextprotocol/registry/internal/telemetry"
+)
+
+// TestMetricMiddleware_ClientCancelledNotCountedAsError verifies that a request
+// the client gave up on (request-context Canceled) is recorded as 499 in
+// http_requests_total and is NOT incremented in http_errors_total — even when
+// the handler returned huma.Error500InternalServerError because its DB call
+// surfaced context.Canceled.
+//
+// Regression test for the case where a ServiceNow-style scraper burst
+// generated thousands of internal 500s for cancelled requests, tripping the
+// "Availability dropped below 95%" alert despite zero 5xx reaching clients.
+func TestMetricMiddleware_ClientCancelledNotCountedAsError(t *testing.T) {
+	shutdown, metrics, err := telemetry.InitMetrics("test")
+	require.NoError(t, err)
+	defer func() { _ = shutdown(context.Background()) }()
+
+	mux := http.NewServeMux()
+	api := humago.New(mux, huma.DefaultConfig("Test API", "1.0.0"))
+	api.UseMiddleware(router.MetricTelemetryMiddleware(metrics))
+
+	// Handler that simulates: DB returns context.Canceled because the client
+	// disconnected, handler converts to huma 500.
+	huma.Register(api, huma.Operation{
+		OperationID: "cancelled-list",
+		Method:      http.MethodGet,
+		Path:        "/v0/servers",
+	}, func(_ context.Context, _ *struct{}) (*struct{}, error) {
+		return nil, huma.Error500InternalServerError("Failed to get registry list",
+			errors.New("error iterating rows: context canceled"))
+	})
+
+	mux.Handle("/metrics", metrics.PrometheusHandler())
+
+	cancelledCtx, cancel := context.WithCancel(context.Background())
+	cancel()
+
+	req := httptest.NewRequest(http.MethodGet, "/v0/servers", nil).WithContext(cancelledCtx)
+	rec := httptest.NewRecorder()
+	mux.ServeHTTP(rec, req)
+
+	scrapeReq := httptest.NewRequest(http.MethodGet, "/metrics", nil)
+	scrapeRec := httptest.NewRecorder()
+	mux.ServeHTTP(scrapeRec, scrapeReq)
+	assert.Equal(t, http.StatusOK, scrapeRec.Code)
+
+	body := scrapeRec.Body.String()
+
+	// The request is recorded — but as 499, not 500.
+	assert.True(t,
+		containsMetric(body, `mcp_registry_http_requests_total`, `status_code="499"`, `path="/v0/servers"`),
+		"expected requests_total to record status_code=499 for the cancelled request; got:\n%s",
+		filterMetricLines(body, "mcp_registry_http_requests_total"),
+	)
+	assert.False(t,
+		containsMetric(body, `mcp_registry_http_requests_total`, `status_code="500"`, `path="/v0/servers"`),
+		"did not expect requests_total to record status_code=500 for a client-cancelled request; got:\n%s",
+		filterMetricLines(body, "mcp_registry_http_requests_total"),
+	)
+
+	// The error counter is NOT bumped for 499s — that is the point of this fix.
+	assert.False(t,
+		containsMetric(body, `mcp_registry_http_errors_total`, `status_code="499"`),
+		"errors_total must not be incremented for client-cancelled requests; got:\n%s",
+		filterMetricLines(body, "mcp_registry_http_errors_total"),
+	)
+	assert.False(t,
+		containsMetric(body, `mcp_registry_http_errors_total`, `status_code="500"`, `path="/v0/servers"`),
+		"errors_total must not record a 500 for a request the client already gave up on; got:\n%s",
+		filterMetricLines(body, "mcp_registry_http_errors_total"),
+	)
+}
+
+// containsMetric returns true iff some line in body starts with `metric{...}`
+// and that label set contains every requested label fragment.
+func containsMetric(body, metric string, labels ...string) bool {
+	for _, line := range strings.Split(body, "\n") {
+		if !strings.HasPrefix(line, metric+"{") {
+			continue
+		}
+		ok := true
+		for _, l := range labels {
+			if !strings.Contains(line, l) {
+				ok = false
+				break
+			}
+		}
+		if ok {
+			return true
+		}
+	}
+	return false
+}
+
+func filterMetricLines(body, metric string) string {
+	var lines []string
+	for _, line := range strings.Split(body, "\n") {
+		if strings.HasPrefix(line, metric) {
+			lines = append(lines, line)
+		}
+	}
+	return strings.Join(lines, "\n")
+}
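Reviewer note: the remap hinges on `ctx.Err()` distinguishing the two cancellation causes, as the comment in the middleware claims. A standalone sketch of that distinction (illustrative only, not part of this change; `classify` is a hypothetical helper):

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// classify mirrors the middleware's check: only context.Canceled marks a
// client disconnect; context.DeadlineExceeded marks a server-set timeout.
func classify(ctx context.Context) string {
	switch err := ctx.Err(); {
	case err == nil:
		return "in flight"
	case errors.Is(err, context.Canceled):
		return "client closed -> remap to 499"
	case errors.Is(err, context.DeadlineExceeded):
		return "server timeout -> still a server error"
	default:
		return err.Error()
	}
}

func main() {
	// net/http cancels the request context when the client disconnects,
	// which surfaces as context.Canceled.
	disconnected, cancel := context.WithCancel(context.Background())
	cancel()
	fmt.Println(classify(disconnected)) // client closed -> remap to 499

	// A deadline we set ourselves surfaces as context.DeadlineExceeded.
	deadlined, stop := context.WithTimeout(context.Background(), time.Millisecond)
	defer stop()
	<-deadlined.Done()
	fmt.Println(classify(deadlined)) // server timeout -> still a server error
}
```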
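And for the helpers at the bottom of the test: `containsMetric` does prefix-plus-substring matching over raw Prometheus exposition lines, so label order never matters. A quick demonstration against a hand-written scrape body (the label set shown is assumed from the attributes the middleware records, not taken from the registry's real exposition):

```go
package main

import (
	"fmt"
	"strings"
)

// Same matching rule as the test helper: the line must start with `metric{`
// and contain every requested label fragment.
func containsMetric(body, metric string, labels ...string) bool {
	for _, line := range strings.Split(body, "\n") {
		if !strings.HasPrefix(line, metric+"{") {
			continue
		}
		matched := true
		for _, l := range labels {
			if !strings.Contains(line, l) {
				matched = false
				break
			}
		}
		if matched {
			return true
		}
	}
	return false
}

func main() {
	// Hand-written scrape output with an assumed label set.
	body := `mcp_registry_http_requests_total{method="GET",path="/v0/servers",status_code="499"} 1`

	fmt.Println(containsMetric(body, "mcp_registry_http_requests_total",
		`status_code="499"`, `path="/v0/servers"`)) // true
	fmt.Println(containsMetric(body, "mcp_registry_http_errors_total",
		`status_code="499"`)) // false: no errors_total line present
}
```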