diff --git a/internal/controller/controller.go b/internal/controller/controller.go index 4c44952..6093d63 100644 --- a/internal/controller/controller.go +++ b/internal/controller/controller.go @@ -31,6 +31,11 @@ const ( EventCreated = "CREATED" // EventDeleted indicates that a pod has been deleted. EventDeleted = "DELETED" + + // unknownArtifactTTL is the TTL for cached 404 responses from the + // deployment record API. Once an artifact is known to be missing, + // we suppress further API calls for this duration. + unknownArtifactTTL = 1 * time.Hour ) type ttlCache interface { @@ -66,6 +71,9 @@ type Controller struct { // post requests are idempotent, so if this cache fails due to // restarts or other events, nothing will break. observedDeployments ttlCache + // best effort cache to suppress API calls for artifacts that + // returned a 404 (no artifact found). Keyed by image digest. + unknownArtifacts ttlCache } // New creates a new deployment tracker controller. @@ -113,6 +121,7 @@ func New(clientset kubernetes.Interface, metadataAggregator podMetadataAggregato apiClient: apiClient, cfg: cfg, observedDeployments: amcache.NewExpiring(), + unknownArtifacts: amcache.NewExpiring(), } // Add event handlers to the informer @@ -442,6 +451,16 @@ func (c *Controller) recordContainer(ctx context.Context, pod *corev1.Pod, conta return fmt.Errorf("invalid status: %s", status) } + // Check if this artifact was previously unknown (404 from the API) + if _, exists := c.unknownArtifacts.Get(digest); exists { + dtmetrics.PostDeploymentRecordUnknownArtifactCacheHit.Inc() + slog.Debug("Artifact previously returned 404, skipping post", + "deployment_name", dn, + "digest", digest, + ) + return nil + } + // Extract image name and tag imageName, version := ociutil.ExtractName(container.Image) @@ -471,9 +490,10 @@ func (c *Controller) recordContainer(ctx context.Context, pod *corev1.Pod, conta ) if err := c.apiClient.PostOne(ctx, record); err != nil { - // Return if no artifact is found + // Return if no artifact is found and cache the digest var noArtifactErr *deploymentrecord.NoArtifactError if errors.As(err, &noArtifactErr) { + c.unknownArtifacts.Set(digest, true, unknownArtifactTTL) return nil } diff --git a/internal/controller/controller_test.go b/internal/controller/controller_test.go new file mode 100644 index 0000000..8d7388f --- /dev/null +++ b/internal/controller/controller_test.go @@ -0,0 +1,179 @@ +package controller + +import ( + "context" + "fmt" + "sync" + "testing" + "time" + + "github.com/github/deployment-tracker/pkg/deploymentrecord" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + amcache "k8s.io/apimachinery/pkg/util/cache" +) + +// mockPoster records all PostOne calls and returns a configurable error. +type mockPoster struct { + mu sync.Mutex + calls int + lastErr error +} + +func (m *mockPoster) PostOne(_ context.Context, _ *deploymentrecord.DeploymentRecord) error { + m.mu.Lock() + defer m.mu.Unlock() + m.calls++ + return m.lastErr +} + +func (m *mockPoster) getCalls() int { + m.mu.Lock() + defer m.mu.Unlock() + return m.calls +} + +// newTestController creates a minimal Controller suitable for unit-testing +// recordContainer without a real Kubernetes cluster. +func newTestController(poster *mockPoster) *Controller { + return &Controller{ + apiClient: poster, + cfg: &Config{ + Template: "{{namespace}}/{{deploymentName}}/{{containerName}}", + LogicalEnvironment: "test", + PhysicalEnvironment: "test", + Cluster: "test", + }, + observedDeployments: amcache.NewExpiring(), + unknownArtifacts: amcache.NewExpiring(), + } +} + +// testPod returns a pod with a single container and a known digest. +func testPod(digest string) (*corev1.Pod, corev1.Container) { + container := corev1.Container{ + Name: "app", + Image: "nginx:latest", + } + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + Namespace: "default", + OwnerReferences: []metav1.OwnerReference{{ + APIVersion: "apps/v1", + Kind: "ReplicaSet", + Name: "test-deployment-abc123", + }}, + }, + Status: corev1.PodStatus{ + ContainerStatuses: []corev1.ContainerStatus{{ + Name: "app", + ImageID: fmt.Sprintf("docker-pullable://nginx@%s", digest), + }}, + }, + } + return pod, container +} + +func TestRecordContainer_UnknownArtifactCachePopulatedOn404(t *testing.T) { + t.Parallel() + digest := "sha256:unknown404digest" + poster := &mockPoster{ + lastErr: &deploymentrecord.NoArtifactError{}, + } + ctrl := newTestController(poster) + pod, container := testPod(digest) + + // First call should hit the API and get a 404 + err := ctrl.recordContainer(context.Background(), pod, container, deploymentrecord.StatusDeployed, EventCreated, nil) + require.NoError(t, err) + assert.Equal(t, 1, poster.getCalls()) + + // Digest should now be in the unknown artifacts cache + _, exists := ctrl.unknownArtifacts.Get(digest) + assert.True(t, exists, "digest should be cached after 404") +} + +func TestRecordContainer_UnknownArtifactCacheSkipsAPICall(t *testing.T) { + t.Parallel() + digest := "sha256:cacheddigest" + poster := &mockPoster{ + lastErr: &deploymentrecord.NoArtifactError{}, + } + ctrl := newTestController(poster) + pod, container := testPod(digest) + + // First call — API returns 404, populates cache + err := ctrl.recordContainer(context.Background(), pod, container, deploymentrecord.StatusDeployed, EventCreated, nil) + require.NoError(t, err) + assert.Equal(t, 1, poster.getCalls()) + + // Second call — should be served from cache, no API call + err = ctrl.recordContainer(context.Background(), pod, container, deploymentrecord.StatusDeployed, EventCreated, nil) + require.NoError(t, err) + assert.Equal(t, 1, poster.getCalls(), "API should not be called for cached unknown artifact") +} + +func TestRecordContainer_UnknownArtifactCacheAppliesToDecommission(t *testing.T) { + t.Parallel() + digest := "sha256:decommission404" + poster := &mockPoster{ + lastErr: &deploymentrecord.NoArtifactError{}, + } + ctrl := newTestController(poster) + pod, container := testPod(digest) + + // Deploy call — 404, populates cache + err := ctrl.recordContainer(context.Background(), pod, container, deploymentrecord.StatusDeployed, EventCreated, nil) + require.NoError(t, err) + assert.Equal(t, 1, poster.getCalls()) + + // Decommission call for same digest — should skip API + err = ctrl.recordContainer(context.Background(), pod, container, deploymentrecord.StatusDecommissioned, EventDeleted, nil) + require.NoError(t, err) + assert.Equal(t, 1, poster.getCalls(), "decommission should also be skipped for cached unknown artifact") +} + +func TestRecordContainer_UnknownArtifactCacheExpires(t *testing.T) { + t.Parallel() + digest := "sha256:expiringdigest" + poster := &mockPoster{ + lastErr: &deploymentrecord.NoArtifactError{}, + } + ctrl := newTestController(poster) + pod, container := testPod(digest) + + // Seed the cache with a very short TTL to test expiry + ctrl.unknownArtifacts.Set(digest, true, 50*time.Millisecond) + + // Immediately — should be cached + err := ctrl.recordContainer(context.Background(), pod, container, deploymentrecord.StatusDeployed, EventCreated, nil) + require.NoError(t, err) + assert.Equal(t, 0, poster.getCalls(), "should skip API while cached") + + // Wait for expiry + time.Sleep(100 * time.Millisecond) + + // After expiry — should call API again + err = ctrl.recordContainer(context.Background(), pod, container, deploymentrecord.StatusDeployed, EventCreated, nil) + require.NoError(t, err) + assert.Equal(t, 1, poster.getCalls(), "should call API after cache expiry") +} + +func TestRecordContainer_SuccessfulPostDoesNotPopulateUnknownCache(t *testing.T) { + t.Parallel() + digest := "sha256:knowndigest" + poster := &mockPoster{lastErr: nil} // success + ctrl := newTestController(poster) + pod, container := testPod(digest) + + err := ctrl.recordContainer(context.Background(), pod, container, deploymentrecord.StatusDeployed, EventCreated, nil) + require.NoError(t, err) + assert.Equal(t, 1, poster.getCalls()) + + // Digest should NOT be in the unknown artifacts cache + _, exists := ctrl.unknownArtifacts.Get(digest) + assert.False(t, exists, "successful post should not cache digest as unknown") +} diff --git a/pkg/deploymentrecord/client.go b/pkg/deploymentrecord/client.go index 9726523..8ea996c 100644 --- a/pkg/deploymentrecord/client.go +++ b/pkg/deploymentrecord/client.go @@ -170,6 +170,9 @@ type NoArtifactError struct { } func (n *NoArtifactError) Error() string { + if n == nil || n.err == nil { + return "no artifact found" + } return fmt.Sprintf("no artifact found: %s", n.err.Error()) } diff --git a/pkg/dtmetrics/prom.go b/pkg/dtmetrics/prom.go index 79c01eb..2143bd9 100644 --- a/pkg/dtmetrics/prom.go +++ b/pkg/dtmetrics/prom.go @@ -88,4 +88,12 @@ var ( Help: "The total number of non-retryable client failures", }, ) + + //nolint: revive + PostDeploymentRecordUnknownArtifactCacheHit = promauto.NewCounter( + prometheus.CounterOpts{ + Name: "deptracker_post_record_unknown_artifact_cache_hit", + Help: "The total number of API calls avoided due to a cached 404 for an unknown artifact digest", + }, + ) )