28 changes: 27 additions & 1 deletion internal/api/router/router.go
@@ -2,7 +2,9 @@
package router

import (
"context"
"encoding/json"
"errors"
"fmt"
"net/http"
"strings"
@@ -19,6 +21,12 @@ import (
"github.com/modelcontextprotocol/registry/internal/telemetry"
)

// statusClientClosed mirrors NGINX's non-standard 499 — used for requests
// where the client disconnected before we finished. Distinguishing these from
// real server errors keeps the availability metric meaningful under bursts of
// scraper traffic that time out and reconnect.
const statusClientClosed = 499

// Middleware configuration options
type middlewareConfig struct {
skipPaths map[string]bool
@@ -67,6 +75,22 @@ func MetricTelemetryMiddleware(metrics *telemetry.Metrics, options ...Middleware
duration := time.Since(start).Seconds()
statusCode := ctx.Status()

// If the client disconnected before the handler finished, the handler
// likely converted the resulting context.Canceled into a huma 5xx and
// tried to write a response to a closed socket. NGINX records that
// case as a 499 (client closed). Without this remap we count it as a
// server error: a single ServiceNow-style burst that times out a
// few thousand list-servers requests inflates http_errors_total even
// though no client ever saw a 5xx, and the availability alert fires
// on what is effectively just slow responses.
//
// Only context.Canceled is remapped — context.DeadlineExceeded would
// indicate a server-side timeout we set ourselves and should still
// count as a server error if/when we add per-request deadlines.
if reqErr := ctx.Context().Err(); reqErr != nil && errors.Is(reqErr, context.Canceled) {
statusCode = statusClientClosed
}

// Combine common and custom attributes
attrs := []attribute.KeyValue{
attribute.String("method", method),
@@ -77,7 +101,9 @@
// Record metrics
metrics.Requests.Add(ctx.Context(), 1, metric.WithAttributes(attrs...))

if statusCode >= 400 {
// Skip the error counter for client-closed requests so the availability
// metric reflects server-visible errors only.
if statusCode >= 400 && statusCode != statusClientClosed {
metrics.ErrorCount.Add(ctx.Context(), 1, metric.WithAttributes(attrs...))
}

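A note for reviewers on the remap condition: the Canceled/DeadlineExceeded split the comment above relies on is easy to check in isolation. Below is a minimal standalone sketch, not part of this PR; `classify` is a hypothetical stand-in for the middleware's branch:

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// classify mirrors the middleware's decision: only context.Canceled is
// treated as "client closed" (499); a deadline we set ourselves should
// still count as a server error.
func classify(err error) string {
	switch {
	case errors.Is(err, context.Canceled):
		return "499 client closed"
	case errors.Is(err, context.DeadlineExceeded):
		return "5xx server-side timeout"
	default:
		return "no context error"
	}
}

func main() {
	// A client disconnect surfaces as context.Canceled on the request context.
	cancelled, cancel := context.WithCancel(context.Background())
	cancel()
	fmt.Println(classify(cancelled.Err())) // 499 client closed

	// A per-request deadline we impose surfaces as context.DeadlineExceeded.
	deadlined, cancelDeadline := context.WithTimeout(context.Background(), time.Nanosecond)
	defer cancelDeadline()
	<-deadlined.Done()
	fmt.Println(classify(deadlined.Err())) // 5xx server-side timeout
}
```

This is also why the guard uses `errors.Is(reqErr, context.Canceled)` rather than just `reqErr != nil`: a future per-request deadline would still be recorded as a server error.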
119 changes: 119 additions & 0 deletions internal/api/router/router_test.go
@@ -0,0 +1,119 @@
package router_test

import (
"context"
"errors"
"net/http"
"net/http/httptest"
"strings"
"testing"

"github.com/danielgtaylor/huma/v2"
"github.com/danielgtaylor/huma/v2/adapters/humago"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"github.com/modelcontextprotocol/registry/internal/api/router"
"github.com/modelcontextprotocol/registry/internal/telemetry"
)

// TestMetricMiddleware_ClientCancelledNotCountedAsError verifies that a request
// the client gave up on (request-context Canceled) is recorded as 499 in
// http_requests_total and does NOT increment http_errors_total, even when
// the handler returned huma.Error500InternalServerError because its DB call
// surfaced context.Canceled.
//
// Regression test for the case where a ServiceNow-style scraper burst
// generated thousands of internal 500s for cancelled requests, tripping the
// "Availability dropped below 95%" alert despite zero 5xx reaching clients.
func TestMetricMiddleware_ClientCancelledNotCountedAsError(t *testing.T) {
shutdown, metrics, err := telemetry.InitMetrics("test")
require.NoError(t, err)
defer func() { _ = shutdown(context.Background()) }()

mux := http.NewServeMux()
api := humago.New(mux, huma.DefaultConfig("Test API", "1.0.0"))
api.UseMiddleware(router.MetricTelemetryMiddleware(metrics))

// Handler that simulates: DB returns context.Canceled because the client
// disconnected, handler converts to huma 500.
huma.Register(api, huma.Operation{
OperationID: "cancelled-list",
Method: http.MethodGet,
Path: "/v0/servers",
}, func(_ context.Context, _ *struct{}) (*struct{}, error) {
return nil, huma.Error500InternalServerError("Failed to get registry list",
errors.New("error iterating rows: context canceled"))
})

mux.Handle("/metrics", metrics.PrometheusHandler())

cancelledCtx, cancel := context.WithCancel(context.Background())
cancel()

req := httptest.NewRequest(http.MethodGet, "/v0/servers", nil).WithContext(cancelledCtx)
rec := httptest.NewRecorder()
mux.ServeHTTP(rec, req)

scrapeReq := httptest.NewRequest(http.MethodGet, "/metrics", nil)
scrapeRec := httptest.NewRecorder()
mux.ServeHTTP(scrapeRec, scrapeReq)
assert.Equal(t, http.StatusOK, scrapeRec.Code)

body := scrapeRec.Body.String()

// The request is recorded — but as 499, not 500.
assert.True(t,
containsMetric(body, `mcp_registry_http_requests_total`, `status_code="499"`, `path="/v0/servers"`),
"expected requests_total to record status_code=499 for the cancelled request; got:\n%s",
filterMetricLines(body, "mcp_registry_http_requests_total"),
)
assert.False(t,
containsMetric(body, `mcp_registry_http_requests_total`, `status_code="500"`, `path="/v0/servers"`),
"did not expect requests_total to record status_code=500 for a client-cancelled request; got:\n%s",
filterMetricLines(body, "mcp_registry_http_requests_total"),
)

// The error counter is NOT bumped for 499s — that is the point of this fix.
assert.False(t,
containsMetric(body, `mcp_registry_http_errors_total`, `status_code="499"`),
"errors_total must not be incremented for client-cancelled requests; got:\n%s",
filterMetricLines(body, "mcp_registry_http_errors_total"),
)
assert.False(t,
containsMetric(body, `mcp_registry_http_errors_total`, `status_code="500"`, `path="/v0/servers"`),
"errors_total must not record a 500 for a request the client already gave up on; got:\n%s",
filterMetricLines(body, "mcp_registry_http_errors_total"),
)
}

// containsMetric returns true iff some line in body starts with `metric{...}`
// and that label set contains every requested label fragment.
func containsMetric(body, metric string, labels ...string) bool {
for _, line := range strings.Split(body, "\n") {
if !strings.HasPrefix(line, metric+"{") {
continue
}
ok := true
for _, l := range labels {
if !strings.Contains(line, l) {
ok = false
break
}
}
if ok {
return true
}
}
return false
}

func filterMetricLines(body, metric string) string {
var lines []string
for _, line := range strings.Split(body, "\n") {
if strings.HasPrefix(line, metric) {
lines = append(lines, line)
}
}
return strings.Join(lines, "\n")
}
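The test injects a pre-cancelled context directly, which is the cheapest way to exercise the middleware branch. In production the same state arises because the net/http server cancels the request context when it notices the closed connection. A standalone end-to-end sketch of that path, not part of this PR and using only the standard library:

```go
package main

import (
	"context"
	"fmt"
	"net/http"
	"net/http/httptest"
	"time"
)

func main() {
	// A handler shaped like the registry's list endpoint: a slow "DB call"
	// that respects the request context.
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		select {
		case <-time.After(5 * time.Second):
			w.WriteHeader(http.StatusOK)
		case <-r.Context().Done():
			// This branch fires when the client disconnects mid-request.
			fmt.Println("handler saw:", r.Context().Err()) // context canceled
		}
	}))
	defer srv.Close()

	ctx, cancel := context.WithCancel(context.Background())
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, srv.URL, nil)
	if err != nil {
		panic(err)
	}
	go func() {
		time.Sleep(100 * time.Millisecond)
		cancel() // the client gives up, closing the connection
	}()
	_, err = http.DefaultClient.Do(req)
	fmt.Println("client saw:", err) // ... context canceled
}
```

Both routes leave the middleware in the same state: `ctx.Context().Err()` reports `context.Canceled`, the status is remapped to 499, and `http_errors_total` is untouched.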