Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion internal/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ const (
EventCreated = "CREATED"
// EventDeleted indicates that a pod has been deleted.
EventDeleted = "DELETED"

// unknownArtifactTTL is the TTL for cached 404 responses from the
// deployment record API. Once an artifact is known to be missing,
// we suppress further API calls for this duration.
unknownArtifactTTL = 1 * time.Hour
)

type ttlCache interface {
Expand Down Expand Up @@ -66,6 +71,9 @@ type Controller struct {
// post requests are idempotent, so if this cache fails due to
// restarts or other events, nothing will break.
observedDeployments ttlCache
// best effort cache to suppress API calls for artifacts that
// returned a 404 (no artifact found). Keyed by image digest.
unknownArtifacts ttlCache
}

// New creates a new deployment tracker controller.
Expand Down Expand Up @@ -113,6 +121,7 @@ func New(clientset kubernetes.Interface, metadataAggregator podMetadataAggregato
apiClient: apiClient,
cfg: cfg,
observedDeployments: amcache.NewExpiring(),
unknownArtifacts: amcache.NewExpiring(),
}

// Add event handlers to the informer
Expand Down Expand Up @@ -442,6 +451,16 @@ func (c *Controller) recordContainer(ctx context.Context, pod *corev1.Pod, conta
return fmt.Errorf("invalid status: %s", status)
}

// Check if this artifact was previously unknown (404 from the API)
if _, exists := c.unknownArtifacts.Get(digest); exists {
dtmetrics.PostDeploymentRecordUnknownArtifactCacheHit.Inc()
slog.Debug("Artifact previously returned 404, skipping post",
"deployment_name", dn,
"digest", digest,
)
return nil
}

// Extract image name and tag
imageName, version := ociutil.ExtractName(container.Image)

Expand Down Expand Up @@ -471,9 +490,10 @@ func (c *Controller) recordContainer(ctx context.Context, pod *corev1.Pod, conta
)

if err := c.apiClient.PostOne(ctx, record); err != nil {
// Return if no artifact is found
// Return if no artifact is found and cache the digest
var noArtifactErr *deploymentrecord.NoArtifactError
if errors.As(err, &noArtifactErr) {
c.unknownArtifacts.Set(digest, true, unknownArtifactTTL)
return nil
}

Expand Down
179 changes: 179 additions & 0 deletions internal/controller/controller_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
package controller

import (
"context"
"fmt"
"sync"
"testing"
"time"

"github.com/github/deployment-tracker/pkg/deploymentrecord"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
amcache "k8s.io/apimachinery/pkg/util/cache"
)

// mockPoster records all PostOne calls and returns a configurable error.
type mockPoster struct {
mu sync.Mutex
calls int
lastErr error
}

func (m *mockPoster) PostOne(_ context.Context, _ *deploymentrecord.DeploymentRecord) error {
m.mu.Lock()
defer m.mu.Unlock()
m.calls++
return m.lastErr
}

func (m *mockPoster) getCalls() int {
m.mu.Lock()
defer m.mu.Unlock()
return m.calls
}

// newTestController creates a minimal Controller suitable for unit-testing
// recordContainer without a real Kubernetes cluster.
func newTestController(poster *mockPoster) *Controller {
return &Controller{
apiClient: poster,
cfg: &Config{
Template: "{{namespace}}/{{deploymentName}}/{{containerName}}",
LogicalEnvironment: "test",
PhysicalEnvironment: "test",
Cluster: "test",
},
observedDeployments: amcache.NewExpiring(),
unknownArtifacts: amcache.NewExpiring(),
}
}

// testPod returns a pod with a single container and a known digest.
func testPod(digest string) (*corev1.Pod, corev1.Container) {
container := corev1.Container{
Name: "app",
Image: "nginx:latest",
}
pod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "test-pod",
Namespace: "default",
OwnerReferences: []metav1.OwnerReference{{
APIVersion: "apps/v1",
Kind: "ReplicaSet",
Name: "test-deployment-abc123",
}},
},
Status: corev1.PodStatus{
ContainerStatuses: []corev1.ContainerStatus{{
Name: "app",
ImageID: fmt.Sprintf("docker-pullable://nginx@%s", digest),
}},
},
}
return pod, container
}

func TestRecordContainer_UnknownArtifactCachePopulatedOn404(t *testing.T) {
t.Parallel()
digest := "sha256:unknown404digest"
poster := &mockPoster{
lastErr: &deploymentrecord.NoArtifactError{},
}
ctrl := newTestController(poster)
pod, container := testPod(digest)

// First call should hit the API and get a 404
err := ctrl.recordContainer(context.Background(), pod, container, deploymentrecord.StatusDeployed, EventCreated, nil)
require.NoError(t, err)
assert.Equal(t, 1, poster.getCalls())

// Digest should now be in the unknown artifacts cache
_, exists := ctrl.unknownArtifacts.Get(digest)
assert.True(t, exists, "digest should be cached after 404")
}

func TestRecordContainer_UnknownArtifactCacheSkipsAPICall(t *testing.T) {
t.Parallel()
digest := "sha256:cacheddigest"
poster := &mockPoster{
lastErr: &deploymentrecord.NoArtifactError{},
}
ctrl := newTestController(poster)
pod, container := testPod(digest)

// First call — API returns 404, populates cache
err := ctrl.recordContainer(context.Background(), pod, container, deploymentrecord.StatusDeployed, EventCreated, nil)
require.NoError(t, err)
assert.Equal(t, 1, poster.getCalls())

// Second call — should be served from cache, no API call
err = ctrl.recordContainer(context.Background(), pod, container, deploymentrecord.StatusDeployed, EventCreated, nil)
require.NoError(t, err)
assert.Equal(t, 1, poster.getCalls(), "API should not be called for cached unknown artifact")
}

func TestRecordContainer_UnknownArtifactCacheAppliesToDecommission(t *testing.T) {
t.Parallel()
digest := "sha256:decommission404"
poster := &mockPoster{
lastErr: &deploymentrecord.NoArtifactError{},
}
ctrl := newTestController(poster)
pod, container := testPod(digest)

// Deploy call — 404, populates cache
err := ctrl.recordContainer(context.Background(), pod, container, deploymentrecord.StatusDeployed, EventCreated, nil)
require.NoError(t, err)
assert.Equal(t, 1, poster.getCalls())

// Decommission call for same digest — should skip API
err = ctrl.recordContainer(context.Background(), pod, container, deploymentrecord.StatusDecommissioned, EventDeleted, nil)
require.NoError(t, err)
assert.Equal(t, 1, poster.getCalls(), "decommission should also be skipped for cached unknown artifact")
}

func TestRecordContainer_UnknownArtifactCacheExpires(t *testing.T) {
t.Parallel()
digest := "sha256:expiringdigest"
poster := &mockPoster{
lastErr: &deploymentrecord.NoArtifactError{},
}
ctrl := newTestController(poster)
pod, container := testPod(digest)

// Seed the cache with a very short TTL to test expiry
ctrl.unknownArtifacts.Set(digest, true, 50*time.Millisecond)

// Immediately — should be cached
err := ctrl.recordContainer(context.Background(), pod, container, deploymentrecord.StatusDeployed, EventCreated, nil)
require.NoError(t, err)
assert.Equal(t, 0, poster.getCalls(), "should skip API while cached")

// Wait for expiry
time.Sleep(100 * time.Millisecond)

// After expiry — should call API again
err = ctrl.recordContainer(context.Background(), pod, container, deploymentrecord.StatusDeployed, EventCreated, nil)
require.NoError(t, err)
assert.Equal(t, 1, poster.getCalls(), "should call API after cache expiry")
}

func TestRecordContainer_SuccessfulPostDoesNotPopulateUnknownCache(t *testing.T) {
t.Parallel()
digest := "sha256:knowndigest"
poster := &mockPoster{lastErr: nil} // success
ctrl := newTestController(poster)
pod, container := testPod(digest)

err := ctrl.recordContainer(context.Background(), pod, container, deploymentrecord.StatusDeployed, EventCreated, nil)
require.NoError(t, err)
assert.Equal(t, 1, poster.getCalls())

// Digest should NOT be in the unknown artifacts cache
_, exists := ctrl.unknownArtifacts.Get(digest)
assert.False(t, exists, "successful post should not cache digest as unknown")
}
3 changes: 3 additions & 0 deletions pkg/deploymentrecord/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,9 @@ type NoArtifactError struct {
}

func (n *NoArtifactError) Error() string {
if n == nil || n.err == nil {
return "no artifact found"
}
return fmt.Sprintf("no artifact found: %s", n.err.Error())
}

Expand Down
8 changes: 8 additions & 0 deletions pkg/dtmetrics/prom.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,4 +88,12 @@ var (
Help: "The total number of non-retryable client failures",
},
)

//nolint: revive
PostDeploymentRecordUnknownArtifactCacheHit = promauto.NewCounter(
prometheus.CounterOpts{
Name: "deptracker_post_record_unknown_artifact_cache_hit",
Help: "The total number of API calls avoided due to a cached 404 for an unknown artifact digest",
},
)
)
Loading