From bdb6ee31c130644c5fa8ec5ee92d42081d63c870 Mon Sep 17 00:00:00 2001 From: Sid Mohan Date: Tue, 24 Feb 2026 21:08:47 -0800 Subject: [PATCH 1/7] docs: simplify README and align docs terminology around policy gate --- CONTRIBUTING.md | 23 +- README.md | 305 ++++++++++++++++------- docs/ARCHITECTURE.md | 43 ++++ docs/DATA.md | 38 ++- docs/DESIGN.md | 27 +- docs/DOMAIN_DOCS.md | 36 +-- docs/FRONTEND.md | 32 ++- docs/OBSERVABILITY.md | 64 ++++- docs/PRODUCT_SENSE.md | 27 +- docs/RELIABILITY.md | 72 +++--- docs/contracts/datafog-api-contract.md | 67 ++++- docs/demo.html | 28 +-- docs/design-docs/core-beliefs.md | 24 +- docs/design-docs/index.md | 5 +- docs/runbooks/datafog-claude-agent-ux.md | 12 +- docs/runbooks/datafog-codex-agent-ux.md | 16 +- 16 files changed, 568 insertions(+), 251 deletions(-) create mode 100644 docs/ARCHITECTURE.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0f39d42..a7e2b21 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,17 +1,12 @@ -# datafog contributing guidelines +# Contributing to DataFog -The datafog community appreciates your contributions via issues and -pull requests. Note that the [code of conduct](CODE_OF_CONDUCT.md) -applies to all interactions with the datafog project, including -issues and pull requests. +DataFog welcomes contributions via issues and pull requests. The [Code of Conduct](CODE_OF_CONDUCT.md) applies to all project interactions, including issues and PRs. + +When submitting a pull request: + +- Follow project style guidelines. +- Ensure tests and docs are updated where behavior changes. +- Keep commit messages clear and descriptive. +- Confirm your work is covered by a compatible license in line with the project [`LICENSE`](LICENSE). -When submitting pull requests, please follow the style guidelines of -the project, ensure that your code is tested and documented, and write -good commit messages, e.g., following [these -guidelines](https://chris.beams.io/posts/git-commit/). 
-By submitting a pull request, you are licensing your code under the -project [license](LICENSE) and affirming that you either own copyright -(automatic for most individuals) or are authorized to distribute under -the project license (e.g., in case your employer retains copyright on -your work). \ No newline at end of file diff --git a/README.md b/README.md index 924959c..2725d27 100644 --- a/README.md +++ b/README.md @@ -1,171 +1,279 @@ -# datafog-api (Go) +# DataFog -This repository implements the MVP `datafog-api` service in Go. +DataFog is a **policy-first Go service** for protecting sensitive data in AI agent and CLI workflows. -## Local development +It evaluates actions (for example: shell commands, file reads, and file writes) before they execute, and +returns a concrete enforcement decision. -```sh -go mod download -go test ./... -go run ./cmd/datafog-api -``` +This repo has two runtime pieces: + +- `datafog-api` – HTTP API for scan/decide/transform/receipts. +- `datafog-shim` – optional runtime policy gate wrapper for CLI-style execution. + +The wrapper process is still named `datafog-shim` for compatibility, but we describe its role as a *policy gate*. + +## What DataFog does (technical) -Default configuration: +1. **Scan text for sensitive entities** (`/v1/scan`). +2. **Evaluate one action against policy rules** (`/v1/decide`). +3. **Apply deterministic transforms** to detected entities (`/v1/transform`, `/v1/anonymize`). +4. **Emit an auditable receipt** for each decision (`/v1/receipts/{id}`). +5. **Optionally emit decision events** (`/v1/events`) when `DATAFOG_EVENTS_PATH` is set. 
-- `DATAFOG_POLICY_PATH`: `config/policy.json` -- `DATAFOG_RECEIPT_PATH`: `datafog_receipts.jsonl` -- `DATAFOG_ADDR`: `:8080` -- `DATAFOG_API_TOKEN`: optional API token for endpoint protection -- `DATAFOG_RATE_LIMIT_RPS`: `0` (disabled, else max requests per second) -- `DATAFOG_READ_TIMEOUT`: `5s` -- `DATAFOG_WRITE_TIMEOUT`: `10s` -- `DATAFOG_READ_HEADER_TIMEOUT`: `2s` -- `DATAFOG_IDLE_TIMEOUT`: `30s` -- `DATAFOG_SHUTDOWN_TIMEOUT`: `10s` +## What it does not do -Durations accept Go duration syntax (for example: `1s`, `500ms`, `2m`). +- It does not secure every layer of your platform for you. +- It does not continuously discover vulnerabilities. +- It does not manage policy editing UI or dynamic policy updates through the API. +- It does not guarantee zero false positives/negatives from detection (detectors are deterministic and regex/heuristic based). -## HTTP API +## Use cases -All examples use `localhost:8080`. +- Stop risky AI/CLI actions before they run (for example: commands containing API keys). +- Enforce redaction on files created or read by tools. +- Build pre-commit/pre-execution guardrails for internal agents. +- Keep an audit trail for decisions in a local JSONL receipt file. -Canonical contract: +## Repository layout -- See `docs/contracts/datafog-api-contract.md` for endpoint schemas, error codes, and idempotency semantics. -- See `docs/generated/api-schema.md` for registered routes. +- `cmd/datafog-api`: API server. +- `cmd/datafog-shim`: policy-gate wrapper CLI. +- `internal/policy`: policy parsing and matching. +- `internal/scan`: entity detectors. +- `internal/transform`: deterministic redaction/masking/tokenization/anonymization. +- `internal/receipts`: receipt persistence. +- `internal/server`: HTTP handlers and middleware. +- `internal/shim`: decision + execution adapters. +- `config/policy.json`: starter policy used by default. +- `docs/`: API contract and operational docs. 
-### `GET /health`
+## Prerequisites
+
+- Go **1.22+**
+- Optional: Docker (for container workflow)
+- Optional: `jq` for pretty-printing JSON
+
+## Quick start (API only)
 
 ```sh
-curl http://localhost:8080/health
+go mod download
+go run ./cmd/datafog-api
 ```
 
-### `GET /v1/policy/version`
+The API listens on `:8080` by default and requires a valid policy file at `config/policy.json`.
+
+Verify service is up:
 
 ```sh
-curl http://localhost:8080/v1/policy/version
+curl -i http://localhost:8080/health
 ```
 
-### Idempotency
+If you set `DATAFOG_API_TOKEN`, send it on every request using:
+
+- `Authorization: Bearer <token>` header, or
+- `X-API-Key: <token>` header.
+
+## Configuration
 
-The following endpoints accept `idempotency_key`:
+| Variable | Default | Description |
+|---|---:|---|
+| `DATAFOG_POLICY_PATH` | `config/policy.json` | Policy snapshot loaded at startup |
+| `DATAFOG_RECEIPT_PATH` | `datafog_receipts.jsonl` | Append-only receipts file |
+| `DATAFOG_EVENTS_PATH` | *(unset)* | NDJSON event log for decision events |
+| `DATAFOG_ADDR` | `:8080` | HTTP listen address |
+| `DATAFOG_API_TOKEN` | *(unset)* | Optional API auth token |
+| `DATAFOG_RATE_LIMIT_RPS` | `0` | Global request cap in RPS (`0` disables) |
+| `DATAFOG_READ_TIMEOUT` | `5s` | HTTP read timeout |
+| `DATAFOG_WRITE_TIMEOUT` | `10s` | HTTP write timeout |
+| `DATAFOG_READ_HEADER_TIMEOUT` | `2s` | Request-header parse timeout |
+| `DATAFOG_IDLE_TIMEOUT` | `30s` | Idle keep-alive timeout |
+| `DATAFOG_SHUTDOWN_TIMEOUT` | `10s` | Graceful shutdown timeout |
+| `DATAFOG_ENABLE_DEMO` | *(unset)* | Enable `/demo*` endpoints |
+| `DATAFOG_DEMO_HTML` | `docs/demo.html` | Path to demo HTML |
+
+Duration values use Go duration syntax, for example `1s`, `500ms`, `2m`.
+
+## API surface
+
+Base URL defaults to `http://localhost:8080`.
+ +| Method | Path | What it does | +|---|---|---| +| `GET` | `/health` | Health plus policy identity + start time | +| `GET` | `/v1/policy/version` | Current policy id/version | +| `POST` | `/v1/scan` | Run detector set on text | +| `POST` | `/v1/decide` | Evaluate an action + findings and get a decision | +| `POST` | `/v1/transform` | Apply requested transform mode(s) | +| `POST` | `/v1/anonymize` | Apply irreversible anonymization | +| `GET` | `/v1/receipts/{id}` | Read a decision receipt | +| `GET` | `/v1/events` | List recent decision events | +| `GET` | `/metrics` | In-process metrics counters | + +Optional demo routes (only when demo mode is enabled): + +- `GET /demo` +- `POST /demo/exec` +- `POST /demo/write-file` +- `POST /demo/read-file` +- `POST /demo/seed` +- `GET /demo/sandbox` + +## Decisions and idempotency + +Endpoints that accept `idempotency_key`: - `/v1/scan` - `/v1/decide` - `/v1/transform` - `/v1/anonymize` -On repeated requests with the same key: -- identical payload returns the exact same response body and status. -- mismatched payload returns `409` with `code: idempotency_conflict`. +Repeat requests with the same key and identical payload should return the same body and status. +If the same key is reused with a different payload, response is `409` + `idempotency_conflict`. + +## Basic examples -### `POST /v1/scan` +### Scan for entities ```sh curl -X POST http://localhost:8080/v1/scan \ -H "Content-Type: application/json" \ - -d '{"text":"email alice@example.com and card 4111111111111111"}' + -d '{"text":"alice@example.com - API key: SK8x... 
and 555-123-4567"}' ``` -### `POST /v1/decide` +### Decide action ```sh curl -X POST http://localhost:8080/v1/decide \ -H "Content-Type: application/json" \ - -d '{"action":{"type":"file.write","resource":"notes.txt"},"text":"email alice@example.com"}' + -d '{ + "action": { + "type": "file.write", + "resource": "notes.txt" + }, + "text": "customer email is alice@example.com" + }' ``` -### `POST /v1/transform` +### Transform detected PII in text ```sh curl -X POST http://localhost:8080/v1/transform \ -H "Content-Type: application/json" \ - -d '{"text":"email alice@example.com", "mode":"mask"}' + -d '{ + "text": "customer email is alice@example.com", + "findings": [{"entity_type":"email","value":"alice@example.com","start":18,"end":34,"confidence":0.99}], + "mode":"mask" + }' ``` -### `POST /v1/anonymize` +### Fetch a receipt ```sh -curl -X POST http://localhost:8080/v1/anonymize \ - -H "Content-Type: application/json" \ - -d '{"text":"email alice@example.com", "findings":[{"entity_type":"email","value":"alice@example.com","start":0,"end":17,"confidence":0.99}]}' +curl -s http://localhost:8080/v1/receipts/ | jq . ``` -### `GET /v1/receipts/{id}` +### Query events (optional) ```sh -curl http://localhost:8080/v1/receipts/ +curl 'http://localhost:8080/v1/events?limit=20&decision=deny' ``` -## Tests +## Enforcement policy gate (`datafog-shim`) + +`datafog-shim` is an optional runtime layer for CLI-style workflows. +It sends action details to DataFog (`/v1/decide`) before executing shell/file actions. + +Build it: ```sh -go test ./... +go build -o datafog-shim ./cmd/datafog-shim ``` -## Deployment +Use direct shell mode: -The service is deployed as a single stateless binary with optional mounted policy and receipt storage. +```sh +./datafog-shim --policy-url http://localhost:8080 shell rm -rf /tmp/test +``` -### Local/container quick start +Install a managed wrapper: ```sh -docker build -t datafog-api:v2 . 
-docker run --rm -p 8080:8080 \ - -e DATAFOG_API_TOKEN=changeme \ - -e DATAFOG_RATE_LIMIT_RPS=50 \ - -e DATAFOG_RECEIPT_PATH=/var/lib/datafog/datafog_receipts.jsonl \ - -v $(pwd)/config:/app/config:ro \ - -v datafog-receipts:/var/lib/datafog \ - datafog-api:v2 +datafog-shim hooks install --target /usr/bin/git git ``` -### Kubernetes-style production pattern +Route wrappers through `PATH`: -Use `/health` for liveness/readiness checks and mount writable storage for receipts. +```sh +export PATH="$HOME/.datafog/shims:$PATH" +``` -## Enforcement shim (runtime gate) +Common env vars for the policy gate: -`datafog-api` is a policy decision service. For runtime enforcement, use the optional shim: +- `DATAFOG_SHIM_POLICY_URL` (required) +- `DATAFOG_SHIM_API_TOKEN` (required if API token is enabled) +- `DATAFOG_SHIM_MODE` (`enforced` or `observe`) +- `DATAFOG_SHIM_EVENT_SINK` (optional NDJSON sink) -```sh -go build -o datafog-shim ./cmd/datafog-shim +When using `enforced` mode, a blocked action exits non-zero. +In `observe` mode, it logs decisions but allows execution to continue. + +Policy gate receipts are logged to `stderr` in a compact format: -./datafog-shim shell --policy-url http://localhost:8080 rm -rf /tmp/test -./datafog-shim hooks install --target /usr/bin/git git -DATAFOG_SHIM_POLICY_URL=http://localhost:8080 git status +```text +receipt= decision= ``` -The shim supports explicit API mode and wrapper-based PATH interception. +## Policy file behavior and limits -`datafog-shim` can call policy checks directly for an arbitrary adapter/action (`run`) or install command shims (`hooks install`) that wrap target binaries. +- Policies live in JSON at `DATAFOG_POLICY_PATH`. +- The policy is loaded on startup only; file edits require restart. +- A restart is the only reload path for policy changes in this version. +- Invalid or malformed JSON blocks startup. 
-`datafog-shim` calls `/v1/decide` before side-effect actions and only permits actions that resolve to: +`config/policy.json` in this repo is a runnable example with basic allow/deny/redact behavior. -- `allow` -- `allow_with_redaction` +## Limitations and operational notes -Actions that resolve to `transform` or `deny` are blocked until the caller applies an explicit transformation path. +- Detection defaults are fast and deterministic, with bounded coverage. + - Good for common formats (e.g., email, phone, SSN, API keys, credit cards) and lightweight heuristic NER. + - Not a full privacy ML detector. +- Receipt log and event log are file-based and must be writable. +- Large volumes of receipts/events need external retention/rotation strategy. +- `/v1/receipts/{id}` and `/v1/events` are read APIs; there is no policy mutate endpoint. -Supported actions: +## Container quick start -- `shell` (command + args) -- `run [--adapter ] --target ` (adapter name inferred from binary path when omitted) -- `read-file ` -- `write-file ` -- `adapters list` (show built-in adapter families used by shim policy metadata) -- `hooks install ` (PATH interception with generated wrapper) -- `hooks list` -- `hooks uninstall ` +```sh +docker build -t datafog-api:latest . -Decision receipts are returned in stderr for every executed action. +docker run --rm -p 8080:8080 \ + -e DATAFOG_API_TOKEN=changeme \ + -e DATAFOG_RATE_LIMIT_RPS=50 \ + -e DATAFOG_RECEIPT_PATH=/var/lib/datafog/datafog_receipts.jsonl \ + -v "$(pwd)/config:/app/config:ro" \ + -v datafog-receipts:/var/lib/datafog \ + datafog-api:latest +``` -Managed wrapper scripts are generated in `~/.datafog/shims` by default. 
To use a wrapper in PATH, add that directory to the front of your `PATH`: +## Verify setup end-to-end ```sh -export PATH="$HOME/.datafog/shims:$PATH" +# health check +curl -i http://localhost:8080/health + +# decision + receipt loop +RECEIPT_ID=$(curl -s -X POST http://localhost:8080/v1/decide \ + -H "Content-Type: application/json" \ + -d '{"action":{"type":"shell.exec","command":"git"},"text":"no pii here"}' \ +| jq -r '.receipt_id') + +curl -s http://localhost:8080/v1/receipts/$RECEIPT_ID | jq . ``` +Expected outcome: the first request returns a decision and receipt id; second call should return the saved receipt. + +## Kubernetes deployment example + ```yaml apiVersion: apps/v1 kind: Deployment @@ -198,8 +306,12 @@ spec: value: "/app/config/policy.json" - name: DATAFOG_RECEIPT_PATH value: "/var/lib/datafog/datafog_receipts.jsonl" + - name: DATAFOG_EVENTS_PATH + value: "/var/lib/datafog/datafog_events.ndjson" - name: DATAFOG_RATE_LIMIT_RPS value: "100" + - name: DATAFOG_SHUTDOWN_TIMEOUT + value: "10s" volumeMounts: - name: policy mountPath: /app/config @@ -219,3 +331,24 @@ spec: persistentVolumeClaim: claimName: datafog-receipts ``` + +## Documentation map + +- API contract: `docs/contracts/datafog-api-contract.md` +- Architecture/module map: `docs/ARCHITECTURE.md` +- Security and operations: + - `docs/SECURITY.md` + - `docs/RELIABILITY.md` + - `docs/OBSERVABILITY.md` + - `docs/DOMAIN_DOCS.md` +- Design/product context: + - `docs/DESIGN.md` + - `docs/PRODUCT_SENSE.md` + +## If something fails, check these first + +1. `go test ./...` (build/runtime validation before changing policy) +2. `/health` response for policy id/version mismatch +3. Environment variables are set and files are writable +4. API token/header if `DATAFOG_API_TOKEN` is configured +5. 
Policy JSON is valid and rules match expected action fields diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md new file mode 100644 index 0000000..fdf51fb --- /dev/null +++ b/docs/ARCHITECTURE.md @@ -0,0 +1,43 @@ +# Architecture + +## Purpose + +- **System purpose:** evaluate policy for sensitive actions, return a deterministic enforcement decision, and keep an auditable trail. +- **Primary users:** AI/automation agents, platform engineers, security teams, and CLI tool users running command wrappers. +- **Main runtime pieces:** HTTP API (`cmd/datafog-api`), policy engine (`internal/policy`), detector stack (`internal/scan`), transform engine (`internal/transform`), and policy gate (`cmd/datafog-shim`, `internal/shim`). +- **Primary flows:** policy decision serving, text scanning/transforming, and policy gate-based command/file enforcement. + +## Codemap (Where To Change Code) + +- `cmd/datafog-api/main.go` -> application bootstrapping, env-var config, policy + receipt initialization, graceful shutdown. +- `internal/server/server.go` -> route registration and all API handlers (`/v1/scan`, `/v1/decide`, `/v1/transform`, `/v1/anonymize`, `/v1/receipts/{id}`, `/v1/events`, `/metrics`, optional demo routes). +- `internal/policy/policy.go` -> policy validation, matching logic, precedence, and decision outcomes. +- `internal/models/models.go` -> shared request/response/domain types used by API, policy, and receipts. +- `internal/scan` -> deterministic text detectors (`detector.go` and heuristic NER in `ner.go`). +- `internal/transform/transform.go` -> deterministic redaction/transformation implementation and stats. +- `internal/receipts/store.go` -> append-only receipt persistence and retrieval. +- `cmd/datafog-shim/main.go` -> CLI command parser and runtime guardrails for shell/commands/files. +- `internal/shim` -> request adaptation, adapter matching, enforcement mode handling, and event sinks. 
+ +### Flow + +`Client`/agent request → `internal/server` route handler → `internal/policy` evaluator + `internal/scan` (when needed) → optional `transform` path → `ReceiptStore` write → JSON response with `receipt_id`. + +For enforcement: CLI/tool action → `datafog-shim` policy gate → `POST /v1/decide` on DataFog API → execute-or-block in `internal/shim` adapter. + +## Invariants (Must Remain True) + +- API policy loading is static for a process lifetime (`DATAFOG_POLICY_PATH` loaded at startup). +- `/health` and `/v1/policy/version` do not mutate state. +- No secrets are logged or returned in API responses. +- Decision side effects are request-scoped and serialized into receipts before returning a `decide` response. +- If a request includes idempotency keys, repeated requests must return identical status/body or a conflict error. +- Unsupported or unauthenticated requests fail closed (`401`, `4xx`, or `405`) before any enforcement action. + +## Details Live Elsewhere + +- `docs/contracts/datafog-api-contract.md` — API request/response contracts. +- `docs/DESIGN.md` — design principles. +- `docs/PRODUCT_SENSE.md` — users/outcomes/heuristics. +- `docs/SECURITY.md` — threat model and controls. +- `docs/RELIABILITY.md` and `docs/OBSERVABILITY.md` — operations and checks. diff --git a/docs/DATA.md b/docs/DATA.md index d8c4e5c..511c291 100644 --- a/docs/DATA.md +++ b/docs/DATA.md @@ -1,28 +1,46 @@ --- title: "Data" -use_when: "Capturing data model and data-change safety rules for this repo (schemas, migrations, backfills, integrity, and operational safety)." +use_when: "Capturing data model and data-change safety rules for this repo." --- ## Data Model -- Source of truth for schemas (ORM models, migrations, schema dump files) and where they live. -- Entity ownership boundaries (what owns IDs, who can write which tables/collections). +DataFog is intentionally storage-light and file-backed by default. 
+ +- **Policy source:** JSON at `DATAFOG_POLICY_PATH` (default `config/policy.json`). +- **Decision receipts:** immutable JSON lines in `DATAFOG_RECEIPT_PATH` (default `datafog_receipts.jsonl`). +- **Decision events (optional):** NDJSON entries in `DATAFOG_EVENTS_PATH`. +- **Domain types:** policy, request, decision, finding, transform plan, and receipts are defined in `internal/models/models.go` and mirrored in the API contract. ## Migrations -- Migration rules (forward-only vs reversible, locking/online migration expectations, index/constraint strategy). -- Validation steps for schema changes (commands and what to check). +There is no database migration layer in this repository. Policy and storage evolution is file-based: + +- Policy changes require replacing `policy.json` and restarting the service. +- Receipt retention/rotation is performed by `internal/receipts` through `maxEntries` options and file archival rules. +- Any migration of persistent data (for receipts/events) must include a compatibility plan before rollout. ## Backfills And Data Fixes -- How to run backfills safely (idempotence, batching, checkpoints). -- How to verify correctness and how to roll back (or compensate) if needed. +No schema migration framework exists currently; backfills are manual and should be scoped: + +- Validate new policy files in non-production first. +- Snapshot the old receipt/event files if they need to be retained before rollout. +- If policy semantics change, rerun representative workloads through `/v1/decide` for behavioral comparison. +- Use bounded rollout and rollback to the previous image/config if receipt interpretation changes unexpectedly. ## Integrity And Consistency -- Constraints and invariants that must remain true (unique keys, foreign keys, referential rules). -- Concurrency expectations (transactions/isolation, retry policies) where relevant. +- Receipt IDs and action/input hashes must remain consistent for auditability. 
+- Receipt reads/writes are append-only (`Save` appends a JSON line and fsyncs).
+- On startup, the service loads existing receipts into memory; duplicate IDs are coalesced by the in-memory map keyed by receipt ID.
+- Policy validation runs at startup and rejects invalid schemas before serving traffic.
 
 ## Sensitive Data Notes
 
-- Pointers to where sensitive fields live and how they must be handled (logging/redaction, retention, deletion).
+- PII in request text is treated as sensitive and only written in controlled forms:
+  - Scans return findings but do not persist raw payloads in receipts.
+  - Receipts store action metadata and hashes, not entire request text.
+  - Transform outputs should be treated as potentially sensitive when logs are shared externally.
+- Receipts/events paths should be writable only to tightly scoped directories/volumes.
+- Rotate or archive receipts and events per deployment retention policy.
diff --git a/docs/DESIGN.md b/docs/DESIGN.md
index c70b22c..74c2684 100644
--- a/docs/DESIGN.md
+++ b/docs/DESIGN.md
@@ -1,17 +1,26 @@
 ---
 title: "Design"
-use_when: "Documenting UI/UX design principles, visual direction, and interaction standards for this repo."
+use_when: "Documenting API and enforcement design principles, behavior expectations, and consistency standards for DataFog."
 ---
 
 ## Design Principles
-- Clarity over cleverness; make the primary action obvious.
-- Consistency beats novelty; reuse patterns unless there is a strong reason not to.
-- Accessible by default (contrast, focus, keyboard).
-## Visual Direction
-- Use design tokens (colors, spacing, typography) to keep the UI cohesive.
-- Prefer a small, intentional palette and a consistent type scale.
+
+- **Policy-first behavior:** every protected action follows the same decision flow through policy, not ad-hoc command-specific exceptions.
+- **Deterministic results:** identical request + policy inputs must yield identical decisions and transform outputs.
+- **Fail-closed defaults:** unknown actions, unsupported modes, and missing context should prefer safe denial. +- **Operational transparency:** every meaningful action is observable (receipts, logs, request IDs, metrics). +- **Minimal privilege runtime:** API and policy-gate defaults should require explicit opt-in and avoid broad permissions. + +## Visual/API Direction + +- Keep API responses stable and machine-parseable with consistent field names (`request_id`, `trace_id`, `policy_id`, `policy_version`, `receipt_id`). +- Error objects should always include a machine-readable `code` plus a short human message. +- Use plain, documented environment variables and startup defaults so behavior is scriptable and inspectable. ## Interaction Standards -- Every async action has loading, success, and error states with clear messaging. -- Forms validate inline and preserve user input; errors explain how to recover. + +- Endpoints use standard HTTP semantics (`GET` read-only, `POST` for stateful decisions/transforms). +- Every call should include `Content-Type: application/json` where applicable and support deterministic JSON decoding. +- Idempotency behavior is explicit and user-visible for high-risk endpoints. +- `/health` and read endpoints are safe and should remain side-effect free. +- For enforcement failures, return the structured decision/receipt evidence path so downstream tooling can audit and retry safely. diff --git a/docs/DOMAIN_DOCS.md b/docs/DOMAIN_DOCS.md index 7890143..8acadbb 100644 --- a/docs/DOMAIN_DOCS.md +++ b/docs/DOMAIN_DOCS.md @@ -1,30 +1,30 @@ # Domain Docs Registry -Reference for agents: what domain docs exist, how to detect relevant content, and when to create or update them. Domain docs are deployed at bootstrap with baseline guidance. Flesh them out with real, repo-specific content on demand. +Reference for agents: what domain docs exist, how to detect relevant content, and when to update each file. 
Keep this map aligned with actual repository content. ## Domain Docs -| Doc | Path | Purpose | Auto-Detect Signals | Seed Question | +| Doc | Path | Purpose | Auto-Detect Signals | Last Updated | |---|---|---|---|---| -| DESIGN.md | `docs/DESIGN.md` | Design principles, visual direction, interaction standards | — | What are your core design principles? | -| DATA.md | `docs/DATA.md` | Data model and data-change safety rules (migrations/backfills/integrity) | `db/`, migrations, ORM schema files, backfill scripts | What are your data model and migration/backfill safety rules? | -| FRONTEND.md | `docs/FRONTEND.md` | Frontend stack, conventions, component architecture | `package.json` (react/vue/angular/svelte), `next.config.*`, `vite.config.*`, `tsconfig.json` | What's your frontend stack and key conventions? | -| PRODUCT_SENSE.md | `docs/PRODUCT_SENSE.md` | Target users, key outcomes, decision heuristics | — | Who are your target users and what outcomes matter most? | -| RELIABILITY.md | `docs/RELIABILITY.md` | Uptime targets, failure modes, operational guardrails | Dockerfile, health check routes, CI config | What are your reliability requirements? | -| SECURITY.md | `docs/SECURITY.md` | Threat model, auth, data sensitivity, compliance | Auth deps, middleware files, env var references | What security concerns apply? | -| OBSERVABILITY.md | `docs/OBSERVABILITY.md` | Logging, metrics, traces, health checks, agent access | Logging libs (winston/pino/structlog/slog), `/metrics`, opentelemetry/jaeger config, `/healthz` | What observability tools do you use? | -| core-beliefs.md | `docs/design-docs/core-beliefs.md` | Non-negotiable engineering beliefs | — | What are 2-3 non-negotiable engineering beliefs? 
| +| DESIGN.md | `docs/DESIGN.md` | Engineering and API design principles | Policy-driven behavior, enforcement controls, API request/response consistency | 2026-02-24 | +| DATA.md | `docs/DATA.md` | Data model and data-change safety (policy, receipts, events) | `policy.json`, receipt/event files, persistence paths | 2026-02-24 | +| FRONTEND.md | `docs/FRONTEND.md` | Frontend conventions and optional demo surfaces | `docs/demo.html`, `internal/server/demo.go`, demo routes | 2026-02-24 | +| PRODUCT_SENSE.md | `docs/PRODUCT_SENSE.md` | Target users, outcomes, and quality criteria | Requests for new behavior or scope changes | 2026-02-24 | +| RELIABILITY.md | `docs/RELIABILITY.md` | Reliability targets, failure modes, guardrails | Health endpoints, env timeout config, graceful shutdown paths | 2026-02-24 | +| SECURITY.md | `docs/SECURITY.md` | Threat model and runtime controls | API token usage, secret handling, container hardening | 2026-02-24 | +| OBSERVABILITY.md | `docs/OBSERVABILITY.md` | Logs, metrics, and operations access | `/metrics`, `/health`, `/v1/events`, logging output | 2026-02-24 | +| core-beliefs.md | `docs/design-docs/core-beliefs.md` | Core engineering and product beliefs | Roadmap and implementation tradeoffs | 2026-02-24 | ## When to Create or Update -- **he-plan**: Identify relevant/missing domain docs during planning, then create/populate them at end-of-`he-plan` after final plan approval and before transition -- **he-implement**: If implementation reveals a missing, wrong, or incomplete domain doc, create or update it in-place and note in Revision Notes -- **he-learn**: Post-release policy updates from lessons learned -- **he-doc-gardening**: Flag stale domain docs for refresh +- **he-plan:** populate or refresh domain docs during planning handoff when scope requires. +- **he-implement:** update docs when behavior, storage, or observability changes. +- **he-learn:** add outcomes and lessons after major releases. 
+- **he-doc-gardening:** keep stale domain docs current if templates remain in placeholder form. ## How to Create or Update -1. Check if the domain doc file exists (bootstrap deploys all baseline docs) -2. If the doc has only baseline guidance (template defaults): replace with real, repo-specific content using auto-detect signals and current context -3. If it has real content: append or revise — never overwrite working policies without replacing them with something better -4. Preserve section structure (headings stay, content fills in) +1. Confirm whether the file exists and is currently still template-level guidance. +2. Replace placeholders with repo-specific behavior, or append concrete sections. +3. Keep the structure stable where useful and add concrete evidence references (paths, commands, defaults). +4. When behavior changes, update both this registry and the owning doc in the same commit. diff --git a/docs/FRONTEND.md b/docs/FRONTEND.md index c7315ca..7a649b7 100644 --- a/docs/FRONTEND.md +++ b/docs/FRONTEND.md @@ -1,24 +1,34 @@ --- title: "Frontend" -use_when: "Documenting frontend stack, conventions, component architecture, performance budgets, and accessibility requirements for this repo." +use_when: "Documenting frontend stack conventions and UI touchpoints for this repo." --- ## Stack -- Define supported browsers/platforms and the minimum accessibility target. -- Prefer a small set of core dependencies and consistent build tooling across the app. + +DataFog API is a backend-first project. There is no React/Vue/Next.js application in this repository. + +- Primary user-facing UI is API-first: clients interact through HTTP endpoints. +- Optional demo assets are static HTML in `docs/demo.html` and rendered by `GET /demo` when demo mode is enabled. ## Conventions -- Keep components small and named by what they do; avoid "utils soup" without ownership. -- Centralize shared UI primitives; avoid duplicating patterns across pages. 
+ +- Keep behavior explicit and minimal in UI entrypoints. +- Avoid introducing framework lock-in for optional demo surfaces. +- Maintain parity between API behavior and demo output (e.g., transformed/blocked responses in demo should mirror API semantics). ## Component Architecture -- Separate UI rendering from data fetching/mutations where practical. -- Prefer explicit data flow and local state; introduce global state only with a clear boundary. + +- `internal/server` owns all HTTP handlers, including demo handlers (`internal/server/demo.go`). +- Demo UI is static and delegates control flow to the API; business rules remain server-side. +- Policy gate behavior belongs to `internal/shim` and should not duplicate policy logic in the UI. ## Performance -- Avoid unnecessary client work: minimize re-renders, split code on route/feature boundaries, and lazy-load heavy modules. -- Measure before optimizing; keep a short list of performance budgets that matter to users. + +- No frontend bundling/runtime overhead is shipped as part of the core service. +- For optional demo HTML, prefer lightweight markup/CSS/vanilla JS and short payloads from the API. ## Accessibility -- Keyboard navigation works for all interactive controls; focus states are visible. -- Use semantic HTML first; ARIA is for filling gaps, not replacing semantics. + +- Demo HTML should remain keyboard-operable and avoid hidden controls that block screen readers. +- Use semantic markup in docs pages (`button`, `section`, headings, labels). +- Keep contrast and focus states explicit when editing future visual components. diff --git a/docs/OBSERVABILITY.md b/docs/OBSERVABILITY.md index d346b79..5c7ffab 100644 --- a/docs/OBSERVABILITY.md +++ b/docs/OBSERVABILITY.md @@ -1,25 +1,65 @@ --- title: "Observability" -use_when: "Documenting logging, metrics, tracing, and health check conventions for this repo, including how agents can access signals to self-verify behavior." 
+use_when: "Documenting logging, metrics, tracing, and health check conventions for this repo, including how agents can access signals to verify behavior." --- ## Logging Strategy -- Prefer structured logs with consistent fields (service, env, request_id/trace_id, user_id when safe). -- Never log secrets; be deliberate about PII. -- Log at boundaries and on errors; avoid noisy per-loop logging in hot paths. + +DataFog logs request lifecycle events to stdout/stderr with request IDs. + +- Every completed API request logs `request_id`, method, path, status, and latency. +- Panics are recovered and logged as 500 errors. +- Receipt/event helper messages are logged to stderr (`decision=...`, `receipt=...`) by the policy gate and written to file sinks when configured. +- Never emit raw request secrets or credentials in logs. ## Metrics -- Track the golden signals: latency, traffic, errors, saturation. -- Prefer histograms for latency; keep label cardinality low. + +In-process counters are exposed at `GET /metrics`: + +- `total_requests` +- `error_requests` +- `by_status` +- `by_path` +- `by_method` +- `uptime_seconds` +- `started_at` + +Use: + +```sh +curl -s http://localhost:8080/metrics | jq . +``` + +This can be scraped by Prometheus-compatible tooling or sampled by scripts for local checks. ## Traces -- Propagate trace context across service boundaries. -- Trace the critical paths (requests, background jobs) with stable span names. + +Distributed tracing is not yet implemented in this repository. If you add tracing, preserve the request correlation fields (`x-request-id` / `X-Request-ID`) as the minimum boundary signal. ## Health Checks -- Health checks are fast and deterministic; readiness reflects dependency availability when needed. -- Document expected status codes and what "unhealthy" means operationally. + +- `GET /health` returns `200` with policy identity and startup timestamp when service is ready. 
+- Failures return a non-200 status with an error payload; panics are recovered rather than crashing the handler.
+
+Use:
+
+```sh
+curl -i http://localhost:8080/health
+```
+
+## Event and decision introspection
+
+- Configure `DATAFOG_EVENTS_PATH` to emit NDJSON decision events.
+- Query events through `GET /v1/events` with optional filters:
+
+```sh
+curl 'http://localhost:8080/v1/events?limit=20&decision=deny'
+curl 'http://localhost:8080/v1/events?adapter=claude&after=2026-02-24T00:00:00Z'
+```
 
 ## Agent Access
 
-- Provide at least one concrete way to query each signal (logs, metrics, traces) without tribal knowledge.
-- Include 1-2 copy-pastable examples per signal once the stack is known (commands, URLs, or queries).
+
+- Start by checking `/health` and `/metrics` after boot.
+- Reproduce a request and inspect the returned `receipt_id`.
+- Pull the immutable receipt: `GET /v1/receipts/{id}`.
+- Confirm enforcement events with optional filtering from `/v1/events`.
diff --git a/docs/PRODUCT_SENSE.md b/docs/PRODUCT_SENSE.md
index 91a4238..62b3add 100644
--- a/docs/PRODUCT_SENSE.md
+++ b/docs/PRODUCT_SENSE.md
@@ -4,17 +4,28 @@ use_when: "Capturing target users, success outcomes, decision heuristics, and qu
 ---
 
 ## Target Users
-- Name the primary user and the primary job-to-be-done; list any secondary users explicitly.
-- Call out non-users (who this is not for) to reduce scope creep.
+
+- **Primary:** platform and security teams that need deterministic enforcement for AI agents and developer tooling.
+- **Secondary:** developers integrating DataFog with local or remote CLIs (Claude, Codex, custom adapters), and SREs operating privacy controls in CI/CD environments.
+- **Non-users:** teams seeking full SIEM/data lake analytics platform functionality; this repo is a policy and enforcement core, not a complete security platform.
 
 ## Key Outcomes
-- Define 1-3 outcomes that matter and how you will measure them (even if qualitative).
-- Prefer metrics tied to user time, reliability, and task completion.
+ +- **Trustworthy decisions:** deterministic policy outcomes for the same input/context. +- **Safe execution paths:** easy deployment path for command/file protection without hand-authored wrappers. +- **Auditable behavior:** every protected decision is traceable via receipt IDs, matched rules, and optional event logs. +- **Operator confidence:** clear status, error, and failure semantics so teams can operate under change. ## Decision Heuristics -- Prefer shipping a smaller, complete slice over a broad, partial feature. -- Optimize for reducing user effort and reducing operational burden. + +- Default to least-privilege: if policy/rule ambiguity exists, deny first. +- Prefer policy updates over code changes when behavior can be expressed declaratively in `policy.json`. +- Add behavior behind explicit CLI flags and env vars (`--observe`, `DATAFOG_SHIM_MODE`) rather than global defaults. +- Keep transform plans explicit and logged; avoid silent redactions. ## Quality Criteria -- Clear error messages and recovery paths; no silent failures. -- Sensible defaults and empty states; predictable navigation. + +- Clear errors with machine-readable `code` + human-readable `message`. +- Deterministic behavior for repeated requests under unchanged policy. +- Readable policy artifacts (`policy.json`, receipts) and easy rollback points. +- No silent mutation paths: all meaningful decisions should be traceable by `receipt_id` or events. diff --git a/docs/RELIABILITY.md b/docs/RELIABILITY.md index 385adee..0e0688b 100644 --- a/docs/RELIABILITY.md +++ b/docs/RELIABILITY.md @@ -5,58 +5,58 @@ use_when: "Capturing reliability goals, failure modes, monitoring, and operation ## Reliability goals (MVP) -- Primary flow (`POST /v1/scan`): 99.9% availability, p95 latency below 250ms at steady load. -- Policy and redaction consistency (`POST /v1/decide`, `POST /v1/transform`, `POST /v1/anonymize`): 99.5% availability, p95 latency below 350ms. 
-- Health signal (`GET /health`): 99.99% availability for readiness/liveness checks. +- Core API availability: target `99.9%` service availability. +- Primary decision path (`POST /v1/decide`) latency should remain low and stable under sustained load. +- `/health` should remain fast and dependable for liveness/readiness checks. -Definition of degraded: +Definition of degraded (for operational alerts): -- Availability below target for 5-minute windows. -- p95 latency sustained > 1.5x target for 10 minutes. -- Error rate > 1% for any public endpoint. +- Endpoint errors above normal baseline for 5+ minute windows. +- Repeated `429` bursts with no clear client-side remediation. +- Sustained startup failures or repeated process restarts. ## Failure Modes Top failures and controls: -- Policy file missing or invalid JSON: - - Signal: `policy_load_failed_total` increases, `/health` may degrade. - - Blast radius: all scan/decide/transform calls fail. - - Recovery: roll back to last known-good `config/policy.json`, fix schema, redeploy. +- **Invalid/missing policy file:** + - Signal: startup failure (process exits), `not found`/schema logs. + - Blast radius: full API path fails to start. + - Recovery: fix policy JSON, validate `policy_id`/`policy_version`, and restart. -- Receipt path write failure: - - Signal: request-level `receipt_write_failed` metric spikes, partial request successes. - - Blast radius: observability of decisions degrades first; policy logic still runs. - - Recovery: fix filesystem permissions, point to healthy `DATAFOG_RECEIPT_PATH`, restart. +- **Receipt persistence failure:** + - Signal: `/v1/decide` returns internal errors, repeated `receipt_error`. + - Blast radius: policy enforcement decisions may degrade as persistence is required. + - Recovery: verify `DATAFOG_RECEIPT_PATH` permissions and disk health, or relocate path. -- Rate limit configuration too low or malformed: - - Signal: sudden `429` rise and client-side retries. 
- - Blast radius: throughput reduction for bursty clients. - - Recovery: validate and tune `DATAFOG_RATE_LIMIT_RPS`, deploy config change. +- **Rate limit misconfiguration:** + - Signal: sudden `429` rise (`rate_limited`). + - Blast radius: throttling of legitimate traffic. + - Recovery: tune `DATAFOG_RATE_LIMIT_RPS` per environment profile. -- Bad deployment image or env drift: - - Signal: crash/restart loop, increased non-2xx responses. - - Blast radius: endpoint unavailability. - - Recovery: rollback image/version and redeploy after diff review. +- **Shutdown behavior:** + - Signal: long process termination, orphaned requests. + - Recovery: respect `DATAFOG_SHUTDOWN_TIMEOUT`; verify SIGTERM/SIGINT handling in runbook. ## Monitoring -Minimum signal set: +Minimum signal set (all from first-party endpoints): -- Error rate by endpoint and status code. -- p95/p99 latency per endpoint. -- `/health` pass/fail and startup duration. -- `DATAFOG_RATE_LIMIT_RPS` rejections. -- Receipt persistence success rate. +- Error rate by endpoint/path/status from `/metrics`. +- Request volume and traffic mix from `/metrics`. +- `GET /health` response time and status for readiness/liveness. +- Receipt file write errors in logs. -Alert rules: +Alerting should focus on: -- Page if SLO burn reaches 10% remaining over 10 minutes. -- Page on crash loop, persistent readiness failure, or error budget burn above threshold. -- Warn on sustained latency regression above 2x target for two consecutive intervals. +- sustained error-rate increases with low traffic baselines, +- process restart loops, +- blocked authentication spikes (`401`), +- persistent write failures or full disks on receipt/event paths. ## Operational Guardrails -- Keep configuration centralized and immutable per release (`policy`, env vars, receipt path). -- Every change must include a verified rollback command or known Git point-in-time for the container image and config map. 
-- Prefer controlled rollout with canaries for policy schema changes and rate-limit changes. +- Keep config explicit and versioned (`policy.json`, deployment manifests, env settings). +- Deploy policy changes through normal rollout controls (staging + canary when possible). +- Rotate credentials and tokens on incidents. +- Document rollback path before enabling policy or enforcement changes in production. diff --git a/docs/contracts/datafog-api-contract.md b/docs/contracts/datafog-api-contract.md index 8e7f800..5173635 100644 --- a/docs/contracts/datafog-api-contract.md +++ b/docs/contracts/datafog-api-contract.md @@ -55,6 +55,7 @@ If `DATAFOG_RATE_LIMIT_RPS` is greater than `0`, requests are subject to a servi - `hash_error` (500) - `receipt_error` (500) - `internal_error` (500) +- `events_read_error` (500) ## Endpoints @@ -137,7 +138,7 @@ Scans free text and returns deterministic findings. "trace_id": "string", "findings": [ { - "entity_type": "email|phone|ssn|api_key|credit_card", + "entity_type": "email|phone|ssn|api_key|credit_card|person|organization|location", "value": "string", "start": 0, "end": 5, @@ -196,7 +197,7 @@ Evaluates action policy against findings and returns a deterministic decision. "policy_id": "string", "matched_rules": ["rule_id"], "transform_plan": [ - { "entity_type": "email", "mode": "mask|tokenize|anonymize|redact" } + { "entity_type": "email", "mode": "mask|tokenize|anonymize|redact|replace|hash" } ], "findings": [ { @@ -221,7 +222,7 @@ Transforms text based on per-entity transforms. { "text": "string (required)", "findings": [], - "mode": "mask|tokenize|anonymize|redact", + "mode": "mask|tokenize|anonymize|redact|replace|hash", "entity_modes": { "email": "mask", "phone": "tokenize" @@ -230,11 +231,12 @@ Transforms text based on per-entity transforms. 
"trace_id": "optional correlation id", "idempotency_key": "optional key for replay-safe dedupe" } +``` -`transform` accepts only the documented modes (`mask`, `tokenize`, `anonymize`, `redact`) in both `mode` and `entity_modes` values. +`transform` accepts only documented modes (`mask`, `tokenize`, `anonymize`, `redact`, `replace`, `hash`) in both `mode` and `entity_modes` values. Invalid transform mode values result in `400` with `code: invalid_request`. `entity_modes` must not contain empty keys. -``` + #### Response 200 @@ -322,14 +324,67 @@ Returns persisted decision receipts. } ], "transform_plan": [ - { "entity_type": "email", "mode": "mask|tokenize|anonymize|redact" } + { "entity_type": "email", "mode": "mask|tokenize|anonymize|redact|replace|hash" } ], "reason": "optional" } ``` +### `GET /v1/events` + +Returns decision events when `DATAFOG_EVENTS_PATH` is configured (or another reader is set). + +Query params: + +- `limit` (`1..1000`, default `100`) +- `after` (RFC3339 timestamp) +- `before` (RFC3339 timestamp) +- `decision` (`allow|transform|allow_with_redaction|deny`) +- `adapter` (tool/adapter filter, e.g. `claude`, `codex`) + +```json +{ + "events": [ + { + "timestamp": "RFC3339 timestamp", + "mode": "enforced|observe", + "action_type": "string", + "tool": "string", + "resource": "string", + "command": "string", + "args": ["string"], + "sensitive": true, + "decision": "allow|transform|allow_with_redaction|deny", + "allowed": true, + "receipt_id": "string", + "matched_rules": ["string"], + "reason": "optional", + "check_error": "optional", + "request_id": "optional", + "trace_id": "optional" + } + ], + "total": 1 +} +``` + +If no events are configured or none match, return `{"events":[],"total":0}`. + ## Idempotency - Supported endpoints: `POST /v1/scan`, `POST /v1/decide`, `POST /v1/transform`, `POST /v1/anonymize`. - Replaying the same idempotency key and identical semantic payload returns the same status and body. 
- Reusing a key with different payloads returns `409` and `code: idempotency_conflict`. + +## Optional demo endpoints (if enabled) + +The following routes are only available when the server is started with `DATAFOG_ENABLE_DEMO` or `--enable-demo`: + +- `GET /demo` +- `POST /demo/exec` +- `POST /demo/write-file` +- `POST /demo/read-file` +- `POST /demo/seed` +- `GET /demo/sandbox` + +These return JSON payloads for execution and file operations and are intended for documentation/demo purposes only. diff --git a/docs/demo.html b/docs/demo.html index bad9b1e..095efa2 100644 --- a/docs/demo.html +++ b/docs/demo.html @@ -106,7 +106,7 @@

DataFog Scenario Explorer

{ id: 'shell-api-key', title: 'Agent leaks API key in shell command', - desc: 'An AI agent tries to run a git push, but the text it passes contains an API secret. The shim detects the key and blocks execution.', + desc: 'An AI agent tries to run a git push, but the text it passes contains an API secret. The policy gate detects the key and blocks execution.', tags: [['deny','deny'], ['entity','api_key']], steps: [ { @@ -119,7 +119,7 @@

DataFog Scenario Explorer

{ label: 'Step 2', title: 'DataFog scans for PII', - desc: 'The shim sends the payload to the scan engine before making a policy decision.', + desc: 'The policy gate sends the payload to the scan engine before making a policy decision.', type: 'scan', request: { text: 'Deploying with api_key=SK8xPw2mQ9nR4vT7yBcD3fGh to production' } }, @@ -132,8 +132,8 @@

DataFog Scenario Explorer

}, { label: 'Step 4', - title: 'Shim blocks execution', - desc: 'The command never runs. The agent receives a deny response with the matched rule.', + title: 'Policy gate blocks execution', + desc: 'The command never runs. The policy gate returns a deny response with the matched rule.', type: 'exec', request: { command: 'git', args: ['push','origin','main'], stdin: 'Deploying with api_key=SK8xPw2mQ9nR4vT7yBcD3fGh to production' } } @@ -142,7 +142,7 @@

DataFog Scenario Explorer

{ id: 'file-write-redact', title: 'File write with PII gets redacted', - desc: 'An agent writes a log file containing emails and SSNs. The shim detects PII and redacts it before the file hits disk.', + desc: 'An agent writes a log file containing emails and SSNs. The policy gate detects PII and redacts it before the file hits disk.', tags: [['redact','allow_with_redaction'], ['entity','email'], ['entity','ssn']], steps: [ { @@ -155,7 +155,7 @@

DataFog Scenario Explorer

{ label: 'Step 2', title: 'DataFog scans content', - desc: 'Before writing, the shim scans the file content for sensitive entities.', + desc: 'Before writing, the policy gate scans the file content for sensitive entities.', type: 'scan', request: { text: 'Customer: alice@example.com\nSSN: 123-45-6789\nPhone: 555-123-4567\nStatus: Active' } }, @@ -169,7 +169,7 @@

DataFog Scenario Explorer

{ label: 'Step 4', title: 'File written with redactions', - desc: 'The shim writes the file, but PII has been replaced. Compare original vs what hit disk.', + desc: 'The policy gate writes the file, but PII has been replaced. Compare original vs what hit disk.', type: 'write', request: { filename: 'summary.txt', content: 'Customer: alice@example.com\nSSN: 123-45-6789\nPhone: 555-123-4567\nStatus: Active' } } @@ -178,7 +178,7 @@

DataFog Scenario Explorer

{ id: 'safe-command', title: 'Clean command passes through', - desc: 'An agent runs a safe command with no PII. The shim allows it and returns the output.', + desc: 'An agent runs a safe command with no PII. The policy gate allows it and returns the output.', tags: [['allow','allow']], steps: [ { @@ -191,7 +191,7 @@

DataFog Scenario Explorer

{ label: 'Step 2', title: 'DataFog scans context', - desc: 'The shim scans the command context. No PII found.', + desc: 'The policy gate scans the command context. No PII found.', type: 'scan', request: { text: 'echo Hello from DataFog' } }, @@ -214,20 +214,20 @@

DataFog Scenario Explorer

{ id: 'file-read-redact', title: 'Reading a file redacts PII in output', - desc: 'A file on disk has PII. When the agent reads it through the shim, sensitive values are redacted in the returned content.', + desc: 'A file on disk has PII. When the agent reads it through the policy gate, sensitive values are redacted in the returned content.', tags: [['redact','allow_with_redaction'], ['entity','email'], ['entity','phone']], steps: [ { label: 'Step 1', title: 'Seed a file with PII', - desc: 'First, we write a raw file to the sandbox (bypassing the shim) so there is PII on disk to read back.', + desc: 'First, we write a raw file to the sandbox (bypassing the policy gate) so there is PII on disk to read back.', type: 'seed', request: { filename: 'contacts.txt', content: 'Bob: bob@corp.io / 415-555-0199\nInternal key: api_key=Abc1234567890123456' } }, { label: 'Step 2', title: 'Agent requests file read', - desc: 'The agent asks the shim to read contacts.txt.', + desc: 'The agent asks the policy gate to read contacts.txt.', type: 'info', display: { filename: 'contacts.txt', action: 'file.read' } }, @@ -524,10 +524,10 @@

${esc(s.title)}

} async function runSeedStep(req) { - // Write directly to sandbox, bypassing the shim gate — raw PII lands on disk + // Write directly to sandbox, bypassing the policy gate — raw PII lands on disk const { data, ms } = await apiCall('POST', '/demo/seed', req); if (data.seeded) { - return `
File ${esc(req.filename)} seeded in sandbox (bypassed shim — raw PII on disk)
+ return `
File ${esc(req.filename)} seeded in sandbox (bypassed policy gate — raw PII on disk)
${esc(req.content)}
${ms}ms
`; } diff --git a/docs/design-docs/core-beliefs.md b/docs/design-docs/core-beliefs.md index ec1057c..8fa3872 100644 --- a/docs/design-docs/core-beliefs.md +++ b/docs/design-docs/core-beliefs.md @@ -1,17 +1,21 @@ # Core Beliefs -Document the product and engineering beliefs that guide roadmap, architecture, and delivery decisions. +These beliefs guide roadmap and implementation decisions for DataFog. - +## Belief 1: Default to Safe Deny -## Belief 1 +- **Statement:** If policy is missing, ambiguous, or malformed, the system should fail closed. +- **Why it matters:** Privacy controls must not provide a false sense of safety through accidental permissive behavior. +- **Tradeoffs:** This can produce more friction for first-time users, but improves safety during policy rollout and migration. -- Statement: -- Why it matters: -- Tradeoffs: +## Belief 2: Determinism Is a Security Control -## Belief 2 +- **Statement:** The same input and policy snapshot must produce identical decisions and artifacts every time. +- **Why it matters:** Deterministic outcomes make policy testing, incident response, and audit easier. +- **Tradeoffs:** Some advanced heuristic techniques can be less predictable; those are only used when wrapped in explicit policy outcomes. -- Statement: -- Why it matters: -- Tradeoffs: +## Belief 3: Auditable by Default + +- **Statement:** Enforcement without trail is incomplete enforcement. +- **Why it matters:** Operators and auditors need `receipt_id`, matched rules, and evidence signals to validate behavior under pressure. +- **Tradeoffs:** Persistence adds storage and lifecycle overhead, but it is essential for trust and recovery. diff --git a/docs/design-docs/index.md b/docs/design-docs/index.md index ff2d88e..f4cca9e 100644 --- a/docs/design-docs/index.md +++ b/docs/design-docs/index.md @@ -1,8 +1,7 @@ # Design Docs Index -Design rationale and deep dives live here. +Design rationale and deep dives for this repository. 
## Documents -- `core-beliefs.md` - +- `core-beliefs.md` — engineering and product beliefs that guide policy and roadmap choices. diff --git a/docs/runbooks/datafog-claude-agent-ux.md b/docs/runbooks/datafog-claude-agent-ux.md index e54843b..01fda55 100644 --- a/docs/runbooks/datafog-claude-agent-ux.md +++ b/docs/runbooks/datafog-claude-agent-ux.md @@ -27,7 +27,7 @@ The flow is: 1. Start policy service. 2. Run one bootstrap command. 3. Source one generated env file. -4. Keep PATH updated with shim directory. +4. Keep PATH updated with the policy-gate wrapper directory. This makes "I have policy-aware coding agent behavior" a predictable, low-friction sequence. @@ -35,7 +35,7 @@ This makes "I have policy-aware coding agent behavior" a predictable, low-fricti - `datafog-api` running and reachable (for example `http://localhost:8080`). - `claude` binary on PATH or available by absolute path. -- `go` installed for shim build (first run only). +- `go` installed for policy gate wrapper build (first run only). - Shell startup files are optional; dry-run mode can be used first. ## Fast setup flow @@ -50,7 +50,7 @@ chmod +x scripts/claude-datafog-setup.sh The script: - Builds `datafog-shim` (if needed), -- Installs a managed shim named `claude`, +- Installs a managed policy-gate wrapper named `claude`, - Writes a helper env file `~/.datafog/claude-datafog.env`, - Shows a minimal activation checklist. @@ -63,7 +63,7 @@ source ~/.datafog/claude-datafog.env export PATH="$HOME/.datafog/shims:$PATH" ``` -Expected behavior after this is that running `which claude` should resolve to the shim path in `~/.datafog/shims`. +Expected behavior after this is that running `which claude` should resolve to the managed policy-gate wrapper in `~/.datafog/shims`. 
## Verification @@ -75,7 +75,7 @@ DATAFOG_SHIM_API_TOKEN="" claude --help With policy defaults in place, if there is a matching policy rule for the command action metadata: - allow/allow_with_redaction: command executes and emits decision info, -- deny/transform: command is blocked with a visible `PolicyDecisionError` in shim output. +- deny/transform: command is blocked with a visible `PolicyDecisionError` in policy-gate output. Audit evidence can be checked from sink: @@ -90,7 +90,7 @@ Expected NDJSON events include action type, tool `claude`, decision, and request - `--mode observe` for non-blocking rollout. - `--mode enforced` for hard blocking. - `--api-token` to enforce tokened policy API requests. -- `--install-git` to additionally gate `git` through the same shim family. +- `--install-git` to additionally gate `git` through the same policy-gate family. ## Recovery and escape hatch diff --git a/docs/runbooks/datafog-codex-agent-ux.md b/docs/runbooks/datafog-codex-agent-ux.md index 81ae3f9..f84a3bb 100644 --- a/docs/runbooks/datafog-codex-agent-ux.md +++ b/docs/runbooks/datafog-codex-agent-ux.md @@ -14,7 +14,7 @@ adds policy checkpoints to OpenAI Codex without changing how users run Codex day The end state is: - a normal `codex` command still works, -- every side-effect action passes through `datafog-shim` before execution, +- every side-effect action passes through `datafog-shim`'s policy gate before execution, - policy decisions and enforcement mode are explicit and reversible, - setup can be validated in under two minutes. @@ -27,7 +27,7 @@ The flow is intentionally: 1. Start policy service. 2. Run one bootstrap command. 3. Source one generated env file. -4. Keep PATH updated with shim dir. +4. Keep PATH updated with the policy-gate wrapper directory. This makes "I have policy-aware coding agent behavior" a predictable sequence rather than a long shell script to memorize. 
@@ -35,7 +35,7 @@ This makes "I have policy-aware coding agent behavior" a predictable sequence ra - `datafog-api` running and reachable (for example `http://localhost:8080`). - `codex` binary on PATH or available by absolute path. -- `go` installed for shim build (first run only). +- `go` installed for policy gate wrapper build (first run only). - Write access to shell startup files is optional: the runbook can stay in dry-run mode first. ## Fast setup flow @@ -50,7 +50,7 @@ chmod +x scripts/codex-datafog-setup.sh The script: - Builds `datafog-shim` (if needed), -- Installs a managed shim named `codex`, +- Installs a managed policy-gate wrapper named `codex`, - Writes a helper env file `~/.datafog/codex-datafog.env`, - Shows a minimal activation checklist. @@ -63,7 +63,7 @@ source ~/.datafog/codex-datafog.env export PATH="$HOME/.datafog/shims:$PATH" ``` -Expected behavior after this is that running `which codex` should resolve to the shim path in `~/.datafog/shims`. +Expected behavior after this is that running `which codex` resolves to the managed policy-gate wrapper in `~/.datafog/shims`. ## Verification @@ -75,7 +75,7 @@ DATAFOG_SHIM_API_TOKEN="" codex --help With policy defaults in place, if there is a matching policy rule for the command action metadata: - allow/allow_with_redaction: command executes and emits decision info, -- deny/transform: command is blocked with a visible `PolicyDecisionError` in the shim output. +- deny/transform: command is blocked with a visible `PolicyDecisionError` in the policy-gate output. Audit evidence can be checked by reading the configured sink: @@ -90,11 +90,11 @@ Expected NDJSON events include action type, tool `codex`, decision, and request - `--mode observe` for non-blocking rollout. - `--mode enforced` for hard blocking. - `--api-token` to enforce tokened policy API requests. -- `--install-git` to additionally gate `git` through the same shim family. 
+- `--install-git` to additionally gate `git` through the same policy-gate family. ## Recovery and escape hatch -If a user is blocked during onboarding, run the shim in observe mode to collect logs: +If a user is blocked during onboarding, run the policy gate in observe mode to collect logs: ```sh ./scripts/codex-datafog-setup.sh --policy-url http://localhost:8080 --mode observe From e3650d4fe94f7ffedf05f16ce0da5235bf0b2c20 Mon Sep 17 00:00:00 2001 From: Sid Mohan Date: Tue, 24 Feb 2026 21:36:33 -0800 Subject: [PATCH 2/7] Add perf benchmarks, pprof/fgprof, and policy evaluation normalization --- .github/workflows/main-cicd.yml | 19 ++++++ README.md | 52 +++++++++++---- cmd/datafog-api/main.go | 49 ++++++++++++++ docs/OBSERVABILITY.md | 13 ++++ go.mod | 4 ++ go.sum | 34 ++++++++++ internal/policy/policy.go | 84 ++++++++++++++---------- internal/policy/policy_bench_test.go | 60 +++++++++++++++++ internal/scan/scan_bench_test.go | 37 +++++++++++ internal/server/server.go | 3 +- internal/server/server_benchmark_test.go | 59 +++++++++++++++++ scripts/run-benchmarks.sh | 9 +++ 12 files changed, 376 insertions(+), 47 deletions(-) create mode 100644 go.sum create mode 100644 internal/policy/policy_bench_test.go create mode 100644 internal/scan/scan_bench_test.go create mode 100644 internal/server/server_benchmark_test.go create mode 100755 scripts/run-benchmarks.sh diff --git a/.github/workflows/main-cicd.yml b/.github/workflows/main-cicd.yml index 74fc5af..ae474f9 100644 --- a/.github/workflows/main-cicd.yml +++ b/.github/workflows/main-cicd.yml @@ -40,3 +40,22 @@ jobs: govulncheck ./... - name: Build container image run: docker build . 
--file Dockerfile --tag datafog-api:$(date +%s) + + benchmark: + runs-on: ubuntu-latest + needs: build + steps: + - uses: actions/checkout@v4 + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: "1.24.13" + - name: Run benchmark suites + run: | + mkdir -p /tmp/bench + go test -run '^$' -bench "BenchmarkScanText|BenchmarkPolicyEvaluate|BenchmarkDecideEndpoint|BenchmarkScanEndpoint" -benchmem ./internal/scan ./internal/policy ./internal/server | tee /tmp/bench/benchmark.txt + - name: Upload benchmark artifact + uses: actions/upload-artifact@v4 + with: + name: benchmark-results + path: /tmp/bench/benchmark.txt diff --git a/README.md b/README.md index 2725d27..b25e46e 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,12 @@ # DataFog -DataFog is a **policy-first Go service** for protecting sensitive data in AI agent and CLI workflows. +**The data firewall for agents and developer tools.** -It evaluates actions (for example: shell commands, file reads, and file writes) before they execute, and -returns a concrete enforcement decision. +DataFog is a runtime **data governance layer** for AI agents and developer tooling. + +It runs a single in-process policy loop: **detect → decide → enforce**. +For each payload crossing a process boundary (command execution, file read/write, or API action), +it detects sensitive entities, evaluates policy, and enforces the result before the action proceeds. This repo has two runtime pieces: @@ -14,11 +17,12 @@ The wrapper process is still named `datafog-shim` for compatibility, but we desc ## What DataFog does (technical) -1. **Scan text for sensitive entities** (`/v1/scan`). -2. **Evaluate one action against policy rules** (`/v1/decide`). -3. **Apply deterministic transforms** to detected entities (`/v1/transform`, `/v1/anonymize`). -4. **Emit an auditable receipt** for each decision (`/v1/receipts/{id}`). -5. **Optionally emit decision events** (`/v1/events`) when `DATAFOG_EVENTS_PATH` is set. +1. 
**Detect** sensitive entities in text and payload context (`/v1/scan`). +2. **Decide** using adapter-aware policy rules (`/v1/decide`) from `policy.json`. +3. **Enforce** the decision before execution (`allow`, `transform`, `allow_with_redaction`, or `deny`) in consuming runtimes. +4. **Transform or tokenize** matched data deterministically when a policy asks for it (`/v1/transform`, `/v1/anonymize`). +5. **Emit an auditable receipt** for every enforcement decision (`/v1/receipts/{id}`). +6. **Optionally emit decision events** (`/v1/events`) when `DATAFOG_EVENTS_PATH` is set. ## What it does not do @@ -29,10 +33,16 @@ The wrapper process is still named `datafog-shim` for compatibility, but we desc ## Use cases -- Stop risky AI/CLI actions before they run (for example: commands containing API keys). -- Enforce redaction on files created or read by tools. -- Build pre-commit/pre-execution guardrails for internal agents. -- Keep an audit trail for decisions in a local JSONL receipt file. +- Prevent sensitive data from crossing process boundaries before it leaves the machine (for example: a shell command exposing credentials or a script writing secret-bearing files). +- Enforce policy-specific transformations such as masking, tokenization, or redaction at runtime. +- Add pre-execution guardrails to AI agents and CLI workflows. +- Keep auditable receipts/events for every policy decision. + +## Positioning + +- **Developers and agent builders:** DataFog is a **privacy firewall for CLI tools and AI agents**. It sits in your PATH or runtime, inspects what is flowing through your commands, and enforces policy before data-sensitive actions execute. +- **Security/compliance buyers:** DataFog is runtime policy-as-code enforcement at the process level with receipts for every decision. 
+- **Broader view:** DataFog is the **data plane for agent governance** — detect, decide, enforce, and audit—not just “PII redaction.” ## Repository layout @@ -88,6 +98,8 @@ If you set `DATAFOG_API_TOKEN`, send it on every request using: | `DATAFOG_READ_HEADER_TIMEOUT` | `2s` | Request-header parse timeout | | `DATAFOG_IDLE_TIMEOUT` | `30s` | Idle keep-alive timeout | | `DATAFOG_SHUTDOWN_TIMEOUT` | `10s` | Graceful shutdown timeout | +| `DATAFOG_PPROF_ADDR` | *(unset)* | If set, starts optional profiling server on this address (example `localhost:6060`) | +| `DATAFOG_FGPROF` | `false` | Add `/debug/fgprof` endpoint to the profiling server | | `DATAFOG_ENABLE_DEMO` | *(unset)* | Enable `/demo*` endpoints | | `DATAFOG_DEMO_HTML` | `docs/demo.html` | Path to demo HTML | @@ -118,6 +130,19 @@ Optional demo routes (only when demo mode is enabled): - `POST /demo/seed` - `GET /demo/sandbox` +## Optional profiling endpoints + +For production debugging, set `DATAFOG_PPROF_ADDR` to run an auxiliary profiling server: + +- `/debug/pprof/` (standard net/http/pprof handlers: profiles, goroutines, heap, trace) +- `/debug/fgprof` when `DATAFOG_FGPROF=true` (low-overhead flame graph style profiler) + +Recommended values: + +- `DATAFOG_PPROF_ADDR=:6060` + +The profiling server is disabled by default and should be exposed only on trusted networks. + ## Decisions and idempotency Endpoints that accept `idempotency_key`: @@ -154,7 +179,7 @@ curl -X POST http://localhost:8080/v1/decide \ }' ``` -### Transform detected PII in text +### Transform detected sensitive data in text ```sh curl -X POST http://localhost:8080/v1/transform \ @@ -352,3 +377,4 @@ spec: 3. Environment variables are set and files are writable 4. API token/header if `DATAFOG_API_TOKEN` is configured 5. Policy JSON is valid and rules match expected action fields +6. 
Optional benchmark sweep: `scripts/run-benchmarks.sh` diff --git a/cmd/datafog-api/main.go b/cmd/datafog-api/main.go index 0e43fed..8d21a20 100644 --- a/cmd/datafog-api/main.go +++ b/cmd/datafog-api/main.go @@ -5,6 +5,7 @@ import ( "errors" "log" "net/http" + _ "net/http/pprof" "os" "os/signal" "strconv" @@ -12,6 +13,8 @@ import ( "syscall" "time" + "github.com/felixge/fgprof" + "github.com/datafog/datafog-api/internal/policy" "github.com/datafog/datafog-api/internal/receipts" "github.com/datafog/datafog-api/internal/server" @@ -27,6 +30,8 @@ func main() { shutdownTimeout := getenvDuration("DATAFOG_SHUTDOWN_TIMEOUT", 10*time.Second) enableDemo := getenv("DATAFOG_ENABLE_DEMO", "") != "" || hasFlag("--enable-demo") eventsPath := getenv("DATAFOG_EVENTS_PATH", "datafog_events.ndjson") + pprofAddr := getenv("DATAFOG_PPROF_ADDR", "") + fgprofEnabled := getenvBool("DATAFOG_FGPROF", false) policyData, err := policy.LoadPolicyFromFile(policyPath) if err != nil { @@ -62,6 +67,11 @@ func main() { handler = h.Handler() } + var pprofSrv *http.Server + if pprofAddr != "" { + pprofSrv = startProfilingServer(pprofAddr, fgprofEnabled, log.Default()) + } + srv := &http.Server{ Addr: addr, Handler: handler, @@ -93,6 +103,14 @@ func main() { ctx, cancel := context.WithTimeout(context.Background(), shutdownTimeout) defer cancel() + if pprofSrv != nil { + if err := pprofSrv.Shutdown(ctx); err != nil { + log.Printf("pprof server shutdown failed: %v", err) + if closeErr := pprofSrv.Close(); closeErr != nil && !errors.Is(closeErr, http.ErrServerClosed) { + log.Printf("pprof server forced close failed: %v", closeErr) + } + } + } if err := srv.Shutdown(ctx); err != nil { log.Printf("graceful shutdown failed: %v", err) if closeErr := srv.Close(); closeErr != nil && !errors.Is(closeErr, http.ErrServerClosed) { @@ -147,3 +165,34 @@ func hasFlag(flag string) bool { } return false } + +func getenvBool(key string, fallback bool) bool { + value := strings.TrimSpace(strings.ToLower(os.Getenv(key))) 
+ switch value { + case "1", "true", "t", "yes", "y", "on": + return true + case "0", "false", "f", "no", "n", "off": + return false + } + return fallback +} + +func startProfilingServer(addr string, enableFGProf bool, logger *log.Logger) *http.Server { + mux := http.NewServeMux() + mux.Handle("/debug/pprof/", http.DefaultServeMux) + if enableFGProf { + mux.Handle("/debug/fgprof", fgprof.Handler()) + } + + srv := &http.Server{Addr: addr, Handler: mux} + go func() { + if err := srv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) { + logger.Printf("pprof server exited: %v", err) + } + }() + logger.Printf("pprof enabled at http://%s/debug/pprof/", addr) + if enableFGProf { + logger.Printf("fgprof enabled at http://%s/debug/fgprof", addr) + } + return srv +} diff --git a/docs/OBSERVABILITY.md b/docs/OBSERVABILITY.md index 5c7ffab..2472dbb 100644 --- a/docs/OBSERVABILITY.md +++ b/docs/OBSERVABILITY.md @@ -32,6 +32,19 @@ curl -s http://localhost:8080/metrics | jq . This can be scraped by Prometheus-compatible tooling or sampled by scripts for local checks. +## Profiling + +Optional runtime profiling is available when `DATAFOG_PPROF_ADDR` is set: + +- standard pprof at `/debug/pprof/` +- fgprof flamegraph endpoint at `/debug/fgprof` when `DATAFOG_FGPROF=true` + +```sh +curl -s http://localhost:6060/debug/pprof/heap?debug=1 | head +``` + +Keep profiling endpoints off public networks unless authenticated or otherwise isolated. + ## Traces Distributed tracing is not yet implemented in this repository. If you add tracing, preserve the request correlation fields (`x-request-id` / `X-Request-ID`) as the minimum boundary signal. 
diff --git a/go.mod b/go.mod index d3cf85f..9cf8474 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,7 @@ module github.com/datafog/datafog-api go 1.22 + +require github.com/felixge/fgprof v0.9.5 + +require github.com/google/pprof v0.0.0-20240227163752-401108e1b7e7 // indirect diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..69c0206 --- /dev/null +++ b/go.sum @@ -0,0 +1,34 @@ +github.com/chromedp/cdproto v0.0.0-20230802225258-3cf4e6d46a89/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs= +github.com/chromedp/chromedp v0.9.2/go.mod h1:LkSXJKONWTCHAfQasKFUZI+mxqS4tZqhmtGzzhLsnLs= +github.com/chromedp/sysutil v1.0.0/go.mod h1:kgWmDdq8fTzXYcKIBqIYvRRTnYb9aNS9moAV0xufSww= +github.com/chzyer/logex v1.2.1/go.mod h1:JLbx6lG2kDbNRFnfkgvh4eRJRPX1QCoOIWomwysCBrQ= +github.com/chzyer/readline v1.5.1/go.mod h1:Eh+b79XXUwfKfcPLepksvw2tcLE/Ct21YObkaSkeBlk= +github.com/chzyer/test v1.0.0/go.mod h1:2JlltgoNkt4TW/z9V/IzDdFaMTM2JPIi26O1pF38GC8= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/felixge/fgprof v0.9.5 h1:8+vR6yu2vvSKn08urWyEuxx75NWPEvybbkBirEpsbVY= +github.com/felixge/fgprof v0.9.5/go.mod h1:yKl+ERSa++RYOs32d8K6WEXCB4uXdLls4ZaZPpayhMM= +github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM= +github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw= +github.com/gobwas/ws v1.2.1/go.mod h1:hRKAFb8wOxFROYNsT1bqfWnhX+b5MFeJM9r2ZSwg/KY= +github.com/google/pprof v0.0.0-20240227163752-401108e1b7e7 h1:y3N7Bm7Y9/CtpiVkw/ZWj6lSlDF3F74SfKwfTCer72Q= +github.com/google/pprof v0.0.0-20240227163752-401108e1b7e7/go.mod h1:czg5+yv1E0ZGTi6S6vVK1mke0fV+FaUhNGcd6VRS9Ik= +github.com/ianlancetaylor/demangle v0.0.0-20230524184225-eabc099b10ab/go.mod h1:gx7rwoVhcfuVKG5uya9Hs3Sxj7EIvldVofAWIUtGouw= 
+github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs= +github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +golang.org/x/sys v0.0.0-20220310020820-b874c991c1a5/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/internal/policy/policy.go b/internal/policy/policy.go index 5e2c7ec..c50f4c6 100644 --- a/internal/policy/policy.go +++ b/internal/policy/policy.go @@ -143,6 +143,27 @@ var allowedModes = map[models.TransformMode]struct{}{ models.TransformModeHash: {}, } +// NormalizeForEvaluation returns a copy of policy with rules sorted by priority descending. 
+// Higher priority rules are evaluated first. +func NormalizeForEvaluation(policy models.Policy) models.Policy { + rules := append([]models.Rule(nil), policy.Rules...) + sort.SliceStable(rules, func(i, j int) bool { + return rules[i].Priority > rules[j].Priority + }) + policy.Rules = rules + return policy +} + +// Evaluate evaluates a decision using policy rules. +// +// For hot paths, callers should use EvaluateSorted with a normalized policy +// (created by NormalizeForEvaluation) to avoid repeated sorting. +func Evaluate(policy models.Policy, ctx DecisionContext) DecisionResult { + return EvaluateSorted(NormalizeForEvaluation(policy), ctx) +} + +// EvaluateSorted evaluates policy decisions assuming rules are already sorted +// by priority descending. type DecisionContext struct { Action models.ActionMeta Findings []models.ScanFinding @@ -155,35 +176,7 @@ type DecisionResult struct { Reason string } -var defaultEntityTransforms = []models.TransformStep{ - {EntityType: "email", Mode: models.TransformModeMask}, - {EntityType: "phone", Mode: models.TransformModeTokenize}, - {EntityType: "ssn", Mode: models.TransformModeAnonymize}, - {EntityType: "api_key", Mode: models.TransformModeRedact}, - {EntityType: "credit_card", Mode: models.TransformModeRedact}, - {EntityType: "ip_address", Mode: models.TransformModeMask}, - {EntityType: "date", Mode: models.TransformModeMask}, - {EntityType: "zip_code", Mode: models.TransformModeMask}, - {EntityType: "person", Mode: models.TransformModeRedact}, - {EntityType: "organization", Mode: models.TransformModeMask}, - {EntityType: "location", Mode: models.TransformModeMask}, -} - -var defaultEntityTypes = map[string]struct{}{ - "email": {}, - "phone": {}, - "ssn": {}, - "api_key": {}, - "credit_card": {}, - "ip_address": {}, - "date": {}, - "zip_code": {}, - "person": {}, - "organization": {}, - "location": {}, -} - -func Evaluate(policy models.Policy, ctx DecisionContext) DecisionResult { +func EvaluateSorted(policy 
models.Policy, ctx DecisionContext) DecisionResult { if ctx.Action.Type == "" { return DecisionResult{ Decision: models.DecisionDeny, @@ -198,10 +191,7 @@ func Evaluate(policy models.Policy, ctx DecisionContext) DecisionResult { } } - rules := append([]models.Rule(nil), policy.Rules...) - sort.SliceStable(rules, func(i, j int) bool { - return rules[i].Priority > rules[j].Priority - }) + rules := policy.Rules hasFindings := map[string]struct{}{} for _, f := range ctx.Findings { @@ -286,6 +276,34 @@ func Evaluate(policy models.Policy, ctx DecisionContext) DecisionResult { } } +var defaultEntityTransforms = []models.TransformStep{ + {EntityType: "email", Mode: models.TransformModeMask}, + {EntityType: "phone", Mode: models.TransformModeTokenize}, + {EntityType: "ssn", Mode: models.TransformModeAnonymize}, + {EntityType: "api_key", Mode: models.TransformModeRedact}, + {EntityType: "credit_card", Mode: models.TransformModeRedact}, + {EntityType: "ip_address", Mode: models.TransformModeMask}, + {EntityType: "date", Mode: models.TransformModeMask}, + {EntityType: "zip_code", Mode: models.TransformModeMask}, + {EntityType: "person", Mode: models.TransformModeRedact}, + {EntityType: "organization", Mode: models.TransformModeMask}, + {EntityType: "location", Mode: models.TransformModeMask}, +} + +var defaultEntityTypes = map[string]struct{}{ + "email": {}, + "phone": {}, + "ssn": {}, + "api_key": {}, + "credit_card": {}, + "ip_address": {}, + "date": {}, + "zip_code": {}, + "person": {}, + "organization": {}, + "location": {}, +} + func matchAction(match models.MatchCriteria, requireSensitiveOnly bool, action models.ActionMeta) bool { if !matchesField(match.ActionTypes, action.Type) { return false diff --git a/internal/policy/policy_bench_test.go b/internal/policy/policy_bench_test.go new file mode 100644 index 0000000..31ffa3a --- /dev/null +++ b/internal/policy/policy_bench_test.go @@ -0,0 +1,60 @@ +package policy + +import ( + "testing" + + 
"github.com/datafog/datafog-api/internal/models" +) + +var benchmarkPolicy = models.Policy{ + PolicyID: "bench", + PolicyVersion: "v1", + Rules: []models.Rule{ + {ID: "allow-write", Effect: models.DecisionAllow, Match: models.MatchCriteria{ActionTypes: []string{"file.write"}}, Priority: 10}, + {ID: "transform-api-key", Effect: models.DecisionTransform, Match: models.MatchCriteria{ActionTypes: []string{"file.read"}}, EntityRequirements: []string{"api_key"}, Priority: 50}, + {ID: "transform-ssn", Effect: models.DecisionTransform, Match: models.MatchCriteria{ActionTypes: []string{"db.query"}}, EntityRequirements: []string{"ssn"}, Priority: 70}, + {ID: "deny-shell", Effect: models.DecisionDeny, Match: models.MatchCriteria{ActionTypes: []string{"shell.exec"}}, Priority: 100}, + {ID: "redact-person", Effect: models.DecisionAllowWithRedaction, Match: models.MatchCriteria{ActionTypes: []string{"chat.send"}}, EntityRequirements: []string{"person"}, Priority: 30}, + {ID: "allow-default", Effect: models.DecisionAllow, Match: models.MatchCriteria{ActionTypes: []string{"*"}}, Priority: 1}, + }, +} + +func BenchmarkPolicyEvaluateSorted(b *testing.B) { + p := NormalizeForEvaluation(benchmarkPolicy) + ctx := DecisionContext{ + Action: models.ActionMeta{ + Type: "shell.exec", + Tool: "bash", + Resource: "id_rsa", + }, + Findings: []models.ScanFinding{ + {EntityType: "api_key", Value: "AKIA...", Confidence: 0.95}, + {EntityType: "person", Value: "Ada Lovelace", Confidence: 0.9}, + }, + } + + b.ReportAllocs() + for i := 0; i < b.N; i++ { + _ = EvaluateSorted(p, ctx) + } +} + +func BenchmarkPolicyEvaluateUnsorted(b *testing.B) { + p := benchmarkPolicy + ctx := DecisionContext{ + Action: models.ActionMeta{ + Type: "shell.exec", + Tool: "bash", + Resource: "id_rsa", + }, + Findings: []models.ScanFinding{ + {EntityType: "api_key", Value: "AKIA...", Confidence: 0.95}, + {EntityType: "person", Value: "Ada Lovelace", Confidence: 0.9}, + }, + } + + b.ReportAllocs() + for i := 0; i < b.N; i++ 
{ + _ = Evaluate(p, ctx) + } +} diff --git a/internal/scan/scan_bench_test.go b/internal/scan/scan_bench_test.go new file mode 100644 index 0000000..53efdc7 --- /dev/null +++ b/internal/scan/scan_bench_test.go @@ -0,0 +1,37 @@ +package scan + +import ( + "testing" +) + +var ( + smallText = "The email is alice@example.com and phone is +1 415-555-0199." + mediumText = "user=jane@example.com; token=AKIAIOSFODNN7EXAMPLE; card=4111 1111 1111 1111; api_key=abcd1234efgh5678ijkl; ssn=123-45-6789" + largeText = mediumText + " " + mediumText + " " + mediumText + " " + mediumText +) + +func BenchmarkScanTextSmall(b *testing.B) { + reportScannerPerf(b, smallText, nil) +} + +func BenchmarkScanTextMedium(b *testing.B) { + reportScannerPerf(b, mediumText, nil) +} + +func BenchmarkScanTextLarge(b *testing.B) { + reportScannerPerf(b, largeText, nil) +} + +func BenchmarkScanTextWithFilter(b *testing.B) { + reportScannerPerf(b, mediumText, []string{"email", "api_key", "credit_card"}) +} + +func reportScannerPerf(b *testing.B, text string, filter []string) { + b.Helper() + b.ReportAllocs() + b.SetBytes(int64(len(text))) + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = ScanText(text, filter) + } +} diff --git a/internal/server/server.go b/internal/server/server.go index 5393f3e..3c70304 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -95,6 +95,7 @@ func New(policyData models.Policy, store *receipts.ReceiptStore, logger *log.Log if logger == nil { logger = log.Default() } + policyData = policy.NormalizeForEvaluation(policyData) return &Server{ policy: policyData, store: store, @@ -421,7 +422,7 @@ func (s *Server) handleDecide(w http.ResponseWriter, r *http.Request) { if len(findings) == 0 && req.Text != "" { findings = scan.ScanText(req.Text, nil) } - result := policy.Evaluate(s.policy, policy.DecisionContext{Action: req.Action, Findings: findings}) + result := policy.EvaluateSorted(s.policy, policy.DecisionContext{Action: req.Action, Findings: findings}) 
actionHash, err := hashDecideAction(req.Action) if err != nil { s.respondError(w, http.StatusInternalServerError, models.APIError{Code: "hash_error", Message: "unable to hash action", Details: err.Error(), RequestID: requestID(r)}) diff --git a/internal/server/server_benchmark_test.go b/internal/server/server_benchmark_test.go new file mode 100644 index 0000000..318359f --- /dev/null +++ b/internal/server/server_benchmark_test.go @@ -0,0 +1,59 @@ +package server + +import ( + "io" + "log" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "github.com/datafog/datafog-api/internal/receipts" +) + +func BenchmarkScanEndpoint(b *testing.B) { + server := benchmarkServer(b) + payload := `{"text":"Email is jane@example.com and phone is +1 415 555 0199"}` + + b.ReportAllocs() + b.SetBytes(int64(len(payload))) + b.ResetTimer() + for i := 0; i < b.N; i++ { + req := httptest.NewRequest(http.MethodPost, "/v1/scan", strings.NewReader(payload)) + req.Header.Set("Content-Type", "application/json") + + resp := httptest.NewRecorder() + server.Handler.ServeHTTP(resp, req) + if resp.Code != http.StatusOK { + b.Fatalf("expected 200, got %d", resp.Code) + } + } +} + +func BenchmarkDecideEndpoint(b *testing.B) { + server := benchmarkServer(b) + payload := `{"action":{"type":"file.read","resource":"notes.txt","tool":"cat","sensitive":false}}` + + b.ReportAllocs() + b.SetBytes(int64(len(payload))) + b.ResetTimer() + for i := 0; i < b.N; i++ { + req := httptest.NewRequest(http.MethodPost, "/v1/decide", strings.NewReader(payload)) + req.Header.Set("Content-Type", "application/json") + + resp := httptest.NewRecorder() + server.Handler.ServeHTTP(resp, req) + if resp.Code != http.StatusOK { + b.Fatalf("expected 200, got %d", resp.Code) + } + } +} + +func benchmarkServer(b *testing.B) *http.Server { + store, err := receipts.NewReceiptStore(b.TempDir() + "/receipts.jsonl") + if err != nil { + b.Fatalf("new store: %v", err) + } + h := New(testPolicy(), store, log.New(io.Discard, "", 
0), "", 0) + return &http.Server{Handler: h.Handler()} +} diff --git a/scripts/run-benchmarks.sh b/scripts/run-benchmarks.sh new file mode 100755 index 0000000..ea2a4e0 --- /dev/null +++ b/scripts/run-benchmarks.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +set -euo pipefail + +echo "Running API hot-path benchmark suite..." + +go test -run '^$' -bench 'BenchmarkScanText|BenchmarkPolicyEvaluate|BenchmarkDecideEndpoint|BenchmarkScanEndpoint' -benchmem ./internal/scan ./internal/policy ./internal/server | tee /tmp/benchmark-results.txt + +echo "Benchmark output written to /tmp/benchmark-results.txt" \ No newline at end of file From e19d62c0016e585b612a92f5d880de7aa213710b Mon Sep 17 00:00:00 2001 From: Sid Mohan Date: Tue, 24 Feb 2026 21:38:07 -0800 Subject: [PATCH 3/7] Add path latency metrics and race-detection CI job --- .github/workflows/main-cicd.yml | 12 ++++ README.md | 17 ++--- docs/OBSERVABILITY.md | 2 + internal/server/server.go | 113 ++++++++++++++++++++------------ internal/server/server_test.go | 6 ++ 5 files changed, 100 insertions(+), 50 deletions(-) diff --git a/.github/workflows/main-cicd.yml b/.github/workflows/main-cicd.yml index ae474f9..95cdcb2 100644 --- a/.github/workflows/main-cicd.yml +++ b/.github/workflows/main-cicd.yml @@ -59,3 +59,15 @@ jobs: with: name: benchmark-results path: /tmp/bench/benchmark.txt + + race: + runs-on: ubuntu-latest + needs: build + steps: + - uses: actions/checkout@v4 + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: "1.24.13" + - name: Run race detector + run: go test -race ./... diff --git a/README.md b/README.md index b25e46e..40c1775 100644 --- a/README.md +++ b/README.md @@ -40,9 +40,9 @@ The wrapper process is still named `datafog-shim` for compatibility, but we desc ## Positioning -- **Developers and agent builders:** DataFog is a **privacy firewall for CLI tools and AI agents**. 
It sits in your PATH or runtime, inspects what is flowing through your commands, and enforces policy before data-sensitive actions execute. -- **Security/compliance buyers:** DataFog is runtime policy-as-code enforcement at the process level with receipts for every decision. -- **Broader view:** DataFog is the **data plane for agent governance** — detect, decide, enforce, and audit—not just “PII redaction.” +- **Developers and agent builders:** DataFog is a **data-aware policy enforcement layer** for CLI tools and AI agents. It sits in your PATH or runtime, inspects data flowing through commands, and enforces policy before sensitive actions execute. +- **Security/compliance buyers:** DataFog maps closely to runtime DLP for developer workstations, but without the legacy footprint: policy is programmable (OPA-style), decision-aware, and process-bound. +- **Broader view:** DataFog is the **data plane for agent governance** — detect, decide, enforce, and audit. ## Repository layout @@ -373,8 +373,9 @@ spec: ## If something fails, check these first 1. `go test ./...` (build/runtime validation before changing policy) -2. `/health` response for policy id/version mismatch -3. Environment variables are set and files are writable -4. API token/header if `DATAFOG_API_TOKEN` is configured -5. Policy JSON is valid and rules match expected action fields -6. Optional benchmark sweep: `scripts/run-benchmarks.sh` +2. `go test -race ./...` (check race conditions on concurrency-sensitive paths) +3. `/health` response for policy id/version mismatch +4. Environment variables are set and files are writable +5. API token/header if `DATAFOG_API_TOKEN` is configured +6. Policy JSON is valid and rules match expected action fields +7. 
Optional benchmark sweep: `scripts/run-benchmarks.sh` diff --git a/docs/OBSERVABILITY.md b/docs/OBSERVABILITY.md index 2472dbb..eae0fcf 100644 --- a/docs/OBSERVABILITY.md +++ b/docs/OBSERVABILITY.md @@ -21,6 +21,8 @@ In-process counters are exposed at `GET /metrics`: - `by_status` - `by_path` - `by_method` +- `avg_latency_ms` +- `by_path_avg_latency_ms` - `uptime_seconds` - `started_at` diff --git a/internal/server/server.go b/internal/server/server.go index 3c70304..f10b6f1 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -27,24 +27,27 @@ import ( ) type Server struct { - policy models.Policy - store *receipts.ReceiptStore - eventReader shim.EventReader - apiToken string - rateLimiter *tokenBucket - startedAt time.Time - logger *log.Logger - mu sync.Mutex - statsMu sync.Mutex - decisions map[string]idempotentDecision - scans map[string]idempotentCachedResponse - transforms map[string]idempotentCachedResponse - anonymizes map[string]idempotentCachedResponse - totalCount int64 - errorCount int64 - statusHits map[int]int64 - pathHits map[string]int64 - methodHits map[string]int64 + policy models.Policy + store *receipts.ReceiptStore + eventReader shim.EventReader + apiToken string + rateLimiter *tokenBucket + startedAt time.Time + logger *log.Logger + mu sync.Mutex + statsMu sync.Mutex + decisions map[string]idempotentDecision + scans map[string]idempotentCachedResponse + transforms map[string]idempotentCachedResponse + anonymizes map[string]idempotentCachedResponse + totalCount int64 + errorCount int64 + statusHits map[int]int64 + pathHits map[string]int64 + methodHits map[string]int64 + totalLatencyNs int64 + pathLatencyNs map[string]int64 + pathLatencyCounts map[string]int64 } type requestIDContextKey struct{} @@ -82,13 +85,15 @@ type idempotentCachedResponse struct { } type metricsResponse struct { - TotalRequests int64 `json:"total_requests"` - ErrorRequests int64 `json:"error_requests"` - ByStatus map[string]int64 `json:"by_status"` - 
ByPath map[string]int64 `json:"by_path"` - ByMethod map[string]int64 `json:"by_method"` - StartedAt string `json:"started_at"` - UptimeSeconds float64 `json:"uptime_seconds"` + TotalRequests int64 `json:"total_requests"` + ErrorRequests int64 `json:"error_requests"` + ByStatus map[string]int64 `json:"by_status"` + ByPath map[string]int64 `json:"by_path"` + ByMethod map[string]int64 `json:"by_method"` + ByPathLatency map[string]float64 `json:"by_path_avg_latency_ms"` + AvgLatencyMs float64 `json:"avg_latency_ms"` + StartedAt string `json:"started_at"` + UptimeSeconds float64 `json:"uptime_seconds"` } func New(policyData models.Policy, store *receipts.ReceiptStore, logger *log.Logger, apiToken string, rateLimitRPS int) *Server { @@ -97,19 +102,22 @@ func New(policyData models.Policy, store *receipts.ReceiptStore, logger *log.Log } policyData = policy.NormalizeForEvaluation(policyData) return &Server{ - policy: policyData, - store: store, - apiToken: apiToken, - rateLimiter: newTokenBucket(rateLimitRPS), - startedAt: time.Now().UTC(), - logger: logger, - decisions: map[string]idempotentDecision{}, - scans: map[string]idempotentCachedResponse{}, - transforms: map[string]idempotentCachedResponse{}, - anonymizes: map[string]idempotentCachedResponse{}, - statusHits: map[int]int64{}, - pathHits: map[string]int64{}, - methodHits: map[string]int64{}, + policy: policyData, + store: store, + apiToken: apiToken, + rateLimiter: newTokenBucket(rateLimitRPS), + startedAt: time.Now().UTC(), + logger: logger, + decisions: map[string]idempotentDecision{}, + scans: map[string]idempotentCachedResponse{}, + transforms: map[string]idempotentCachedResponse{}, + anonymizes: map[string]idempotentCachedResponse{}, + statusHits: map[int]int64{}, + pathHits: map[string]int64{}, + methodHits: map[string]int64{}, + totalLatencyNs: 0, + pathLatencyNs: map[string]int64{}, + pathLatencyCounts: map[string]int64{}, } } @@ -164,6 +172,7 @@ func (s *Server) wrapMiddleware(mux *http.ServeMux) 
http.Handler { startedAt := time.Now() handler, pattern := mux.Handler(r) defer func() { + latency := time.Since(startedAt) if rec := recover(); rec != nil { responseWriter.status = http.StatusInternalServerError s.logger.Printf("request panic request_id=%s method=%s path=%s err=%v", reqID, r.Method, r.URL.Path, rec) @@ -173,11 +182,11 @@ func (s *Server) wrapMiddleware(mux *http.ServeMux) http.Handler { responseWriter.status = http.StatusOK } if pattern == "" { - s.recordRequestMetrics(r.Method, "/_not_found", responseWriter.status) + s.recordRequestMetrics(r.Method, "/_not_found", responseWriter.status, latency) } else { - s.recordRequestMetrics(r.Method, canonicalizedRoute(pattern, r.URL.Path), responseWriter.status) + s.recordRequestMetrics(r.Method, canonicalizedRoute(pattern, r.URL.Path), responseWriter.status, latency) } - s.logger.Printf("request complete request_id=%s method=%s path=%s status=%d latency_ms=%d", reqID, r.Method, r.URL.Path, responseWriter.status, time.Since(startedAt).Milliseconds()) + s.logger.Printf("request complete request_id=%s method=%s path=%s status=%d latency_ms=%d", reqID, r.Method, r.URL.Path, responseWriter.status, latency.Milliseconds()) }() if !s.authorized(r) { @@ -275,7 +284,7 @@ func canonicalizedRoute(pattern string, path string) string { return pattern } -func (s *Server) recordRequestMetrics(method string, route string, status int) { +func (s *Server) recordRequestMetrics(method string, route string, status int, latency time.Duration) { s.statsMu.Lock() defer s.statsMu.Unlock() s.totalCount++ @@ -285,6 +294,11 @@ func (s *Server) recordRequestMetrics(method string, route string, status int) { if status >= 400 { s.errorCount++ } + + latencyNs := latency.Nanoseconds() + s.totalLatencyNs += latencyNs + s.pathLatencyNs[route] += latencyNs + s.pathLatencyCounts[route]++ } func (s *Server) handleHealth(w http.ResponseWriter, r *http.Request) { @@ -740,12 +754,27 @@ func (s *Server) snapshotMetrics() metricsResponse { 
byMethod[method] = count } + byPathLatency := map[string]float64{} + for path, count := range s.pathLatencyCounts { + if count == 0 { + continue + } + byPathLatency[path] = float64(s.pathLatencyNs[path]) / float64(count) / float64(time.Millisecond) + } + + avgLatencyMs := 0.0 + if s.totalCount > 0 { + avgLatencyMs = float64(s.totalLatencyNs) / float64(s.totalCount) / float64(time.Millisecond) + } + return metricsResponse{ TotalRequests: s.totalCount, ErrorRequests: s.errorCount, ByStatus: byStatus, ByPath: byPath, ByMethod: byMethod, + ByPathLatency: byPathLatency, + AvgLatencyMs: avgLatencyMs, StartedAt: s.startedAt.Format(time.RFC3339), UptimeSeconds: time.Since(s.startedAt).Seconds(), } diff --git a/internal/server/server_test.go b/internal/server/server_test.go index 0609f61..cf25b91 100644 --- a/internal/server/server_test.go +++ b/internal/server/server_test.go @@ -887,6 +887,12 @@ func TestMetricsEndpoint(t *testing.T) { if got.ByPath["/_not_found"] != 1 { t.Fatalf("expected /_not_found to be tracked once, got %d", got.ByPath["/_not_found"]) } + if got.AvgLatencyMs <= 0 { + t.Fatalf("expected positive average latency, got %f", got.AvgLatencyMs) + } + if got.ByPathLatency["/health"] <= 0 { + t.Fatalf("expected health average latency to be tracked, got %f", got.ByPathLatency["/health"]) + } if _, err := time.Parse(time.RFC3339, got.StartedAt); err != nil { t.Fatalf("expected started_at to be RFC3339, got %q", got.StartedAt) } From ff3e02842eaf597293b84339e9f0eec8d0602031 Mon Sep 17 00:00:00 2001 From: Sid Mohan Date: Tue, 24 Feb 2026 21:40:29 -0800 Subject: [PATCH 4/7] Optimize scan pipeline by caching filter set and skipping NER when not needed --- internal/scan/detector.go | 136 +++++++++++++++++++++++--------------- internal/scan/filters.go | 49 ++++++++++++++ internal/scan/ner.go | 22 +++--- 3 files changed, 145 insertions(+), 62 deletions(-) create mode 100644 internal/scan/filters.go diff --git a/internal/scan/detector.go b/internal/scan/detector.go 
index 7b6401c..93dfc3c 100644 --- a/internal/scan/detector.go +++ b/internal/scan/detector.go @@ -2,9 +2,6 @@ package scan import ( "regexp" - "sort" - "strconv" - "strings" "github.com/datafog/datafog-api/internal/models" ) @@ -67,53 +64,52 @@ var DefaultEntityConfidences = map[string]float64{ "zip_code": 0.80, } +var defaultScanEntityTypes = []string{ + "api_key", + "credit_card", + "date", + "email", + "ip_address", + "phone", + "ssn", + "zip_code", +} + func ScanText(text string, entityFilter []string) []models.ScanFinding { - requested := map[string]struct{}{} - if len(entityFilter) > 0 { - for _, name := range entityFilter { - requested[strings.ToLower(strings.TrimSpace(name))] = struct{}{} - } - } + requested := requestedEntitySet(entityFilter) findings := make([]models.ScanFinding, 0) // Phase 1: Regex engine (fast, always available) - entityTypes := make([]string, 0, len(DefaultEntityPatterns)) - for entityType := range DefaultEntityPatterns { - entityTypes = append(entityTypes, entityType) - } - sort.Strings(entityTypes) - - for _, entityType := range entityTypes { - pattern := DefaultEntityPatterns[entityType] - if len(requested) > 0 { - if _, ok := requested[entityType]; !ok { - continue + for _, entityType := range defaultScanEntityTypes { + if shouldRunEntityType(requested, entityType) { + pattern := DefaultEntityPatterns[entityType] + idxs := pattern.Re.FindAllStringIndex(text, -1) + for _, idx := range idxs { + if len(idx) != 2 || idx[0] < 0 || idx[1] < idx[0] { + continue + } + value := text[idx[0]:idx[1]] + if pattern.Validate != nil && !pattern.Validate(value) { + continue + } + findings = append(findings, models.ScanFinding{ + EntityType: entityType, + Value: value, + Start: idx[0], + End: idx[1], + Confidence: DefaultEntityConfidences[entityType], + }) } } + } - idxs := pattern.Re.FindAllStringIndex(text, -1) - for _, idx := range idxs { - if len(idx) != 2 || idx[0] < 0 || idx[1] < idx[0] { - continue - } - value := text[idx[0]:idx[1]] - if 
pattern.Validate != nil && !pattern.Validate(value) { - continue - } - findings = append(findings, models.ScanFinding{ - EntityType: entityType, - Value: value, - Start: idx[0], - End: idx[1], - Confidence: DefaultEntityConfidences[entityType], - }) - } + if !shouldRunNERForFilter(requested) { + return findings } // Phase 2: NER engine (heuristic, when enabled) - nerFindings := ScanNER(text, entityFilter) - findings = append(findings, nerFindings...) + findings = append(findings, scanNERWithFilter(text, requested)...) return findings } @@ -121,24 +117,30 @@ func ScanText(text string, entityFilter []string) []models.ScanFinding { // luhnValid implements the Luhn algorithm to validate credit card numbers. // It strips spaces and dashes before checking. func luhnValid(s string) bool { - // Strip spaces and dashes - var digits []int + var digits [19]int + n := 0 + for _, ch := range s { - if ch >= '0' && ch <= '9' { - digits = append(digits, int(ch-'0')) - } else if ch == ' ' || ch == '-' { + switch { + case ch >= '0' && ch <= '9': + if n == len(digits) { + return false + } + digits[n] = int(ch - '0') + n++ + case ch == ' ' || ch == '-': continue - } else { + default: return false } } - if len(digits) < 13 || len(digits) > 19 { + if n < 13 || n > 19 { return false } sum := 0 double := false - for i := len(digits) - 1; i >= 0; i-- { + for i := n - 1; i >= 0; i-- { d := digits[i] if double { d *= 2 @@ -154,18 +156,44 @@ func luhnValid(s string) bool { // ipv4Valid checks that each octet is 0-255. func ipv4Valid(s string) bool { - parts := strings.Split(s, ".") - if len(parts) != 4 { + if len(s) == 0 { return false } - for _, part := range parts { - n, err := strconv.Atoi(part) - if err != nil { + + octetCount := 0 + value := 0 + digitsInOctet := 0 + + for i := 0; i <= len(s); i++ { + if i == len(s) || s[i] == '.' 
{ + if digitsInOctet == 0 { + return false + } + if value < 0 || value > 255 { + return false + } + octetCount++ + if octetCount > 4 { + return false + } + if i == len(s) { + break + } + value = 0 + digitsInOctet = 0 + continue + } + + ch := s[i] + if ch < '0' || ch > '9' { return false } - if n < 0 || n > 255 { + value = value*10 + int(ch-'0') + digitsInOctet++ + if digitsInOctet > 3 { return false } } - return true + + return octetCount == 4 } diff --git a/internal/scan/filters.go b/internal/scan/filters.go new file mode 100644 index 0000000..8af54db --- /dev/null +++ b/internal/scan/filters.go @@ -0,0 +1,49 @@ +package scan + +import "strings" + +func requestedEntitySet(entityFilter []string) map[string]struct{} { + if len(entityFilter) == 0 { + return nil + } + requested := make(map[string]struct{}, len(entityFilter)) + for _, name := range entityFilter { + entity := strings.ToLower(strings.TrimSpace(name)) + if entity == "" { + continue + } + requested[entity] = struct{}{} + } + if len(requested) == 0 { + return nil + } + return requested +} + +func shouldRunEntityType(requested map[string]struct{}, entityType string) bool { + if len(requested) == 0 { + return true + } + _, ok := requested[entityType] + return ok +} + +func shouldRunEntity(requested map[string]struct{}, entityType string) bool { + return shouldRunEntityType(requested, entityType) +} + +func shouldRunNERForFilter(requested map[string]struct{}) bool { + if len(requested) == 0 { + return true + } + if _, ok := requested["person"]; ok { + return true + } + if _, ok := requested["organization"]; ok { + return true + } + if _, ok := requested["location"]; ok { + return true + } + return false +} diff --git a/internal/scan/ner.go b/internal/scan/ner.go index 9e5113f..673a4a7 100644 --- a/internal/scan/ner.go +++ b/internal/scan/ner.go @@ -126,16 +126,22 @@ func ScanNER(text string, entityFilter []string) []models.ScanFinding { return nil } - requested := map[string]struct{}{} - if len(entityFilter) > 
0 { - for _, name := range entityFilter { - requested[strings.ToLower(strings.TrimSpace(name))] = struct{}{} - } + requested := requestedEntitySet(entityFilter) + return scanNERWithFilter(text, requested) +} + +func scanNERWithFilter(text string, requested map[string]struct{}) []models.ScanFinding { + if !NEREnabled { + return nil + } + + if len(requested) > 0 && !shouldRunNERForFilter(requested) { + return nil } - wantPerson := len(requested) == 0 || hasKey(requested, "person") - wantOrg := len(requested) == 0 || hasKey(requested, "organization") - wantLoc := len(requested) == 0 || hasKey(requested, "location") + wantPerson := shouldRunEntity(requested, "person") + wantOrg := shouldRunEntity(requested, "organization") + wantLoc := shouldRunEntity(requested, "location") findings := make([]models.ScanFinding, 0) From 02e23649493e23bf584cb4d1d4db25856e9dc61f Mon Sep 17 00:00:00 2001 From: Sid Mohan Date: Tue, 24 Feb 2026 21:55:22 -0800 Subject: [PATCH 5/7] Add automaxprocs tuning and goleak leak checks in CLI packages --- README.md | 1 + cmd/datafog-api/main.go | 18 ++++++++++++++++++ cmd/datafog-api/main_test.go | 14 ++++++++++++++ cmd/datafog-shim/main_test.go | 5 +++++ go.mod | 6 +++++- go.sum | 6 ++++++ 6 files changed, 49 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 40c1775..e17d9c4 100644 --- a/README.md +++ b/README.md @@ -98,6 +98,7 @@ If you set `DATAFOG_API_TOKEN`, send it on every request using: | `DATAFOG_READ_HEADER_TIMEOUT` | `2s` | Request-header parse timeout | | `DATAFOG_IDLE_TIMEOUT` | `30s` | Idle keep-alive timeout | | `DATAFOG_SHUTDOWN_TIMEOUT` | `10s` | Graceful shutdown timeout | +| `GOMAXPROCS` | *(runtime default)* | Auto-tuned at startup to detected CPU limit; set explicitly to override | | `DATAFOG_PPROF_ADDR` | *(unset)* | If set, starts optional profiling server on this address (example `localhost:6060`) | | `DATAFOG_FGPROF` | `false` | Add `/debug/fgprof` endpoint to the profiling server | | `DATAFOG_ENABLE_DEMO` 
| *(unset)* | Enable `/demo*` endpoints | diff --git a/cmd/datafog-api/main.go b/cmd/datafog-api/main.go index 8d21a20..72ad11d 100644 --- a/cmd/datafog-api/main.go +++ b/cmd/datafog-api/main.go @@ -14,6 +14,7 @@ import ( "time" "github.com/felixge/fgprof" + "go.uber.org/automaxprocs/maxprocs" "github.com/datafog/datafog-api/internal/policy" "github.com/datafog/datafog-api/internal/receipts" @@ -22,6 +23,9 @@ import ( ) func main() { + revertAuto := configureMaxProcs(log.Default()) + defer revertAuto() + policyPath := getenv("DATAFOG_POLICY_PATH", "config/policy.json") receiptPath := getenv("DATAFOG_RECEIPT_PATH", "datafog_receipts.jsonl") apiToken := getenv("DATAFOG_API_TOKEN", "") @@ -177,6 +181,20 @@ func getenvBool(key string, fallback bool) bool { return fallback } +func configureMaxProcs(logger *log.Logger) func() { + if logger == nil { + logger = log.Default() + } + undo, err := maxprocs.Set(maxprocs.Logger(func(format string, args ...interface{}) { + logger.Printf(format, args...) 
+ })) + if err != nil { + logger.Printf("maxprocs configuration skipped: %v", err) + return func() {} + } + return undo +} + func startProfilingServer(addr string, enableFGProf bool, logger *log.Logger) *http.Server { mux := http.NewServeMux() mux.Handle("/debug/pprof/", http.DefaultServeMux) diff --git a/cmd/datafog-api/main_test.go b/cmd/datafog-api/main_test.go index b3c26ab..1d8294f 100644 --- a/cmd/datafog-api/main_test.go +++ b/cmd/datafog-api/main_test.go @@ -3,8 +3,22 @@ package main import ( "testing" "time" + + "go.uber.org/goleak" ) +func TestMain(m *testing.M) { + goleak.VerifyTestMain(m) +} + +func TestConfigureMaxProcs(t *testing.T) { + revert := configureMaxProcs(nil) + if revert == nil { + t.Fatalf("expected configureMaxProcs to return a revert function") + } + revert() +} + func TestGetenvDuration(t *testing.T) { t.Run("fallback_when_missing", func(t *testing.T) { t.Setenv("DATAFOG_READ_TIMEOUT", "") diff --git a/cmd/datafog-shim/main_test.go b/cmd/datafog-shim/main_test.go index 7d09228..d32b368 100644 --- a/cmd/datafog-shim/main_test.go +++ b/cmd/datafog-shim/main_test.go @@ -8,8 +8,13 @@ import ( "testing" "github.com/datafog/datafog-api/internal/shim" + "go.uber.org/goleak" ) +func TestMain(m *testing.M) { + goleak.VerifyTestMain(m) +} + func TestParseMode(t *testing.T) { t.Run("default", func(t *testing.T) { mode, err := parseMode("") diff --git a/go.mod b/go.mod index 9cf8474..2028ce1 100644 --- a/go.mod +++ b/go.mod @@ -2,6 +2,10 @@ module github.com/datafog/datafog-api go 1.22 -require github.com/felixge/fgprof v0.9.5 +require ( + github.com/felixge/fgprof v0.9.5 + go.uber.org/automaxprocs v1.6.0 + go.uber.org/goleak v1.3.0 +) require github.com/google/pprof v0.0.0-20240227163752-401108e1b7e7 // indirect diff --git a/go.sum b/go.sum index 69c0206..e80acc3 100644 --- a/go.sum +++ b/go.sum @@ -21,11 +21,17 @@ github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJ github.com/orisano/pixelmatch 
v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g= +github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs= +go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= golang.org/x/sys v0.0.0-20220310020820-b874c991c1a5/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= From 8dffac0e848c079dfd367daa9cf20aed998e1b6e Mon Sep 17 00:00:00 2001 From: Sid Mohan Date: Tue, 24 Feb 2026 21:57:13 -0800 Subject: [PATCH 6/7] Add benchmark trend reporting with benchstat baseline --- .github/workflows/main-cicd.yml | 11 +++++----- README.md | 2 +- scripts/benchmark-baseline.txt | 26 ++++++++++++++++++++++ scripts/run-benchmarks.sh | 38 +++++++++++++++++++++++++++++++-- 4 files changed, 69 insertions(+), 8 deletions(-) create mode 100644 scripts/benchmark-baseline.txt diff 
--git a/.github/workflows/main-cicd.yml b/.github/workflows/main-cicd.yml index 95cdcb2..95368bf 100644 --- a/.github/workflows/main-cicd.yml +++ b/.github/workflows/main-cicd.yml @@ -50,15 +50,16 @@ jobs: uses: actions/setup-go@v5 with: go-version: "1.24.13" - - name: Run benchmark suites + - name: Run benchmark suites with trend report run: | - mkdir -p /tmp/bench - go test -run '^$' -bench "BenchmarkScanText|BenchmarkPolicyEvaluate|BenchmarkDecideEndpoint|BenchmarkScanEndpoint" -benchmem ./internal/scan ./internal/policy ./internal/server | tee /tmp/bench/benchmark.txt - - name: Upload benchmark artifact + bash scripts/run-benchmarks.sh + - name: Upload benchmark artifacts uses: actions/upload-artifact@v4 with: name: benchmark-results - path: /tmp/bench/benchmark.txt + path: | + /tmp/bench/benchmark-current.txt + /tmp/bench/benchmark-trend.txt race: runs-on: ubuntu-latest diff --git a/README.md b/README.md index e17d9c4..018b205 100644 --- a/README.md +++ b/README.md @@ -379,4 +379,4 @@ spec: 4. Environment variables are set and files are writable 5. API token/header if `DATAFOG_API_TOKEN` is configured 6. Policy JSON is valid and rules match expected action fields -7. Optional benchmark sweep: `scripts/run-benchmarks.sh` +7. 
Optional benchmark sweep: `scripts/run-benchmarks.sh` (writes `/tmp/bench/benchmark-current.txt`; if `scripts/benchmark-baseline.txt` exists, also writes `/tmp/bench/benchmark-trend.txt` with benchstat deltas) diff --git a/scripts/benchmark-baseline.txt b/scripts/benchmark-baseline.txt new file mode 100644 index 0000000..536943f --- /dev/null +++ b/scripts/benchmark-baseline.txt @@ -0,0 +1,26 @@ +goos: darwin +goarch: arm64 +pkg: github.com/datafog/datafog-api/internal/scan +cpu: Apple M3 +BenchmarkScanTextSmall-8 64513 16559 ns/op 3.62 MB/s 6881 B/op 51 allocs/op +BenchmarkScanTextMedium-8 38298 30559 ns/op 3.99 MB/s 12209 B/op 54 allocs/op +BenchmarkScanTextLarge-8 6988 170732 ns/op 2.88 MB/s 158424 B/op 193 allocs/op +BenchmarkScanTextWithFilter-8 144634 8321 ns/op 14.66 MB/s 1444 B/op 12 allocs/op +PASS +ok github.com/datafog/datafog-api/internal/scan 5.508s +goos: darwin +goarch: arm64 +pkg: github.com/datafog/datafog-api/internal/policy +cpu: Apple M3 +BenchmarkPolicyEvaluateSorted-8 7168506 154.2 ns/op 16 B/op 1 allocs/op +BenchmarkPolicyEvaluateUnsorted-8 2565021 470.7 ns/op 2136 B/op 5 allocs/op +PASS +ok github.com/datafog/datafog-api/internal/policy 3.144s +goos: darwin +goarch: arm64 +pkg: github.com/datafog/datafog-api/internal/server +cpu: Apple M3 +BenchmarkScanEndpoint-8 58554 19254 ns/op 3.38 MB/s 15734 B/op 99 allocs/op +BenchmarkDecideEndpoint-8 405 2923160 ns/op 0.03 MB/s 11742 B/op 67 allocs/op +PASS +ok github.com/datafog/datafog-api/internal/server 3.022s diff --git a/scripts/run-benchmarks.sh b/scripts/run-benchmarks.sh index ea2a4e0..d3e05ca 100755 --- a/scripts/run-benchmarks.sh +++ b/scripts/run-benchmarks.sh @@ -2,8 +2,42 @@ set -euo pipefail +BENCH_PATTERN='BenchmarkScanText|BenchmarkPolicyEvaluate|BenchmarkDecideEndpoint|BenchmarkScanEndpoint' +OUTPUT_DIR="${DATAFOG_BENCH_OUTPUT_DIR:-/tmp/bench}" +CURRENT_FILE="${DATAFOG_BENCH_CURRENT_FILE:-$OUTPUT_DIR/benchmark-current.txt}" 
+TREND_FILE="${DATAFOG_BENCH_TREND_FILE:-$OUTPUT_DIR/benchmark-trend.txt}" +BASELINE_FILE="${DATAFOG_BENCH_BASELINE_FILE:-$(pwd)/scripts/benchmark-baseline.txt}" + +mkdir -p "$OUTPUT_DIR" + echo "Running API hot-path benchmark suite..." +go test -run '^$' -bench "$BENCH_PATTERN" -benchmem ./internal/scan ./internal/policy ./internal/server | tee "$CURRENT_FILE" + +echo "Benchmark output written to $CURRENT_FILE" + +if [ -f "$BASELINE_FILE" ]; then + echo "Comparing against baseline: $BASELINE_FILE" + + BENCHSTAT_BIN="" + if command -v benchstat >/dev/null 2>&1; then + BENCHSTAT_BIN="$(command -v benchstat)" + else + if ! go install golang.org/x/perf/cmd/benchstat@latest >/dev/null; then + echo "warn: unable to install benchstat; skipping trend report" + exit 0 + fi -go test -run '^$' -bench 'BenchmarkScanText|BenchmarkPolicyEvaluate|BenchmarkDecideEndpoint|BenchmarkScanEndpoint' -benchmem ./internal/scan ./internal/policy ./internal/server | tee /tmp/benchmark-results.txt + GO_BIN="$(go env GOPATH)/bin/benchstat" + if [ -x "$GO_BIN" ]; then + BENCHSTAT_BIN="$GO_BIN" + else + echo "warn: benchstat not available after installation; skipping trend report" + exit 0 + fi + fi -echo "Benchmark output written to /tmp/benchmark-results.txt" \ No newline at end of file + "$BENCHSTAT_BIN" "$BASELINE_FILE" "$CURRENT_FILE" | tee "$TREND_FILE" + echo "Trend report written to $TREND_FILE" +else + echo "No baseline found at $BASELINE_FILE, skipping trend comparison" +fi From 2b47d3d741a948a9a6e53bff9e8b349f4143a62c Mon Sep 17 00:00:00 2001 From: Sid Mohan Date: Tue, 24 Feb 2026 22:26:47 -0800 Subject: [PATCH 7/7] Complete datafog-shim policy-error enforcement control and observability docs --- README.md | 1 + cmd/datafog-api/main.go | 14 ++- cmd/datafog-shim/main.go | 153 +++++++++++++++++++------ cmd/datafog-shim/main_test.go | 133 ++++++++++++++++++++- docs/ARCHITECTURE.md | 2 +- docs/DATA.md | 4 +- docs/contracts/datafog-api-contract.md | 14 ++- internal/receipts/store.go | 9 
+- internal/receipts/store_test.go | 27 +++++ internal/server/server.go | 9 +- internal/server/server_test.go | 57 +++++++++ internal/shim/enforcer.go | 40 ++++--- internal/shim/enforcer_test.go | 24 ++++ 13 files changed, 426 insertions(+), 61 deletions(-) diff --git a/README.md b/README.md index 018b205..e8156ac 100644 --- a/README.md +++ b/README.md @@ -239,6 +239,7 @@ Common env vars for the policy gate: - `DATAFOG_SHIM_API_TOKEN` (required if API token is enabled) - `DATAFOG_SHIM_MODE` (`enforced` or `observe`) - `DATAFOG_SHIM_EVENT_SINK` (optional NDJSON sink) +- `DATAFOG_SHIM_ENFORCE_POLICY_ERRORS` (`true` to block on policy service errors even in observe mode) When using `enforced` mode, a blocked action exits non-zero. In `observe` mode, it logs decisions but allows execution to continue. diff --git a/cmd/datafog-api/main.go b/cmd/datafog-api/main.go index 72ad11d..a441030 100644 --- a/cmd/datafog-api/main.go +++ b/cmd/datafog-api/main.go @@ -33,7 +33,7 @@ func main() { rateLimitRPS := getenvInt("DATAFOG_RATE_LIMIT_RPS", 0) shutdownTimeout := getenvDuration("DATAFOG_SHUTDOWN_TIMEOUT", 10*time.Second) enableDemo := getenv("DATAFOG_ENABLE_DEMO", "") != "" || hasFlag("--enable-demo") - eventsPath := getenv("DATAFOG_EVENTS_PATH", "datafog_events.ndjson") + eventsPath := getenv("DATAFOG_EVENTS_PATH", "") pprofAddr := getenv("DATAFOG_PPROF_ADDR", "") fgprofEnabled := getenvBool("DATAFOG_FGPROF", false) @@ -47,10 +47,18 @@ func main() { log.Fatalf("init receipts: %v", err) } - eventSink := shim.NewNDJSONDecisionEventSink(eventsPath) + var eventSink shim.DecisionEventSink + var eventReader shim.EventReader + if eventsPath != "" { + eventStore := shim.NewNDJSONDecisionEventSink(eventsPath) + eventSink = eventStore + eventReader = eventStore + } h := server.New(policyData, store, log.Default(), apiToken, rateLimitRPS) - h.SetEventReader(eventSink) + if eventReader != nil { + h.SetEventReader(eventReader) + } var handler http.Handler if enableDemo { diff --git 
a/cmd/datafog-shim/main.go b/cmd/datafog-shim/main.go index e833867..9079918 100644 --- a/cmd/datafog-shim/main.go +++ b/cmd/datafog-shim/main.go @@ -10,6 +10,7 @@ import ( "path/filepath" "runtime" "sort" + "strconv" "strings" "github.com/datafog/datafog-api/internal/shim" @@ -22,21 +23,24 @@ const ( ) type shimRuntimeConfig struct { - policyURL string - apiToken string - mode string - eventSink string - shimDir string - sensitive bool + policyURL string + apiToken string + mode string + eventSink string + shimDir string + sensitive bool + enforcePolicyErrors bool + enforcePolicyErrorsSet bool } type managedShimMetadata struct { - Command string - Adapter string - Target string - Mode string - PolicyURL string - EventSink string + Command string + Adapter string + Target string + Mode string + PolicyURL string + EventSink string + EnforcePolicyErrors bool } func main() { @@ -50,6 +54,7 @@ func run(argv []string) error { flags := flag.NewFlagSet("datafog-shim", flag.ContinueOnError) policyURL := flags.String("policy-url", "", "base URL for datafog API (for example http://localhost:8080)") apiToken := flags.String("api-token", "", "API token for policy decisions") + enforcePolicyErrors := flags.String("enforce-policy-errors", "", "set to true to block command execution when policy service check fails") mode := flags.String("mode", "", "enforcement mode: enforced|observe") eventSink := flags.String("event-sink", "", "path for NDJSON decision event sink") shimDir := flags.String("shim-dir", "", "directory for installed adapter shims") @@ -58,13 +63,20 @@ func run(argv []string) error { return err } + enforcePolicyErrorsValue, enforcePolicyErrorsSet, err := parseBoolOption(*enforcePolicyErrors) + if err != nil { + return err + } + cfg, err := resolveRuntimeConfig(shimRuntimeConfig{ - policyURL: *policyURL, - apiToken: *apiToken, - mode: *mode, - eventSink: *eventSink, - shimDir: *shimDir, - sensitive: *sensitive, + policyURL: *policyURL, + apiToken: *apiToken, + mode: 
*mode, + eventSink: *eventSink, + shimDir: *shimDir, + sensitive: *sensitive, + enforcePolicyErrors: enforcePolicyErrorsValue, + enforcePolicyErrorsSet: enforcePolicyErrorsSet, }) if err != nil { return err @@ -96,13 +108,32 @@ func run(argv []string) error { } func resolveRuntimeConfig(input shimRuntimeConfig) (shimRuntimeConfig, error) { + enforcePolicyErrors := false + enforcePolicyErrorsSet := false + if input.enforcePolicyErrorsSet { + enforcePolicyErrors = input.enforcePolicyErrors + enforcePolicyErrorsSet = true + } else { + envRaw := strings.TrimSpace(os.Getenv("DATAFOG_SHIM_ENFORCE_POLICY_ERRORS")) + if envRaw != "" { + parsed, err := parseBoolValue(envRaw) + if err != nil { + return shimRuntimeConfig{}, err + } + enforcePolicyErrors = parsed + enforcePolicyErrorsSet = true + } + } + cfg := shimRuntimeConfig{ - policyURL: coalesce(input.policyURL, os.Getenv("DATAFOG_SHIM_POLICY_URL"), defaultPolicyURL), - apiToken: coalesce(input.apiToken, os.Getenv("DATAFOG_SHIM_API_TOKEN")), - mode: coalesce(input.mode, os.Getenv("DATAFOG_SHIM_MODE"), string(shim.ModeEnforced)), - eventSink: coalesce(input.eventSink, os.Getenv("DATAFOG_SHIM_EVENT_SINK")), - shimDir: coalesce(input.shimDir, os.Getenv("DATAFOG_SHIM_DIR"), defaultShimDir()), - sensitive: input.sensitive, + policyURL: coalesce(input.policyURL, os.Getenv("DATAFOG_SHIM_POLICY_URL"), defaultPolicyURL), + apiToken: coalesce(input.apiToken, os.Getenv("DATAFOG_SHIM_API_TOKEN")), + mode: coalesce(input.mode, os.Getenv("DATAFOG_SHIM_MODE"), string(shim.ModeEnforced)), + eventSink: coalesce(input.eventSink, os.Getenv("DATAFOG_SHIM_EVENT_SINK")), + shimDir: coalesce(input.shimDir, os.Getenv("DATAFOG_SHIM_DIR"), defaultShimDir()), + sensitive: input.sensitive, + enforcePolicyErrors: enforcePolicyErrors, + enforcePolicyErrorsSet: enforcePolicyErrorsSet, } parsedMode, err := parseMode(cfg.mode) @@ -124,6 +155,25 @@ func parseMode(raw string) (shim.EnforcementMode, error) { } } +func parseBoolValue(raw string) (bool, 
error) { + parsed, err := strconv.ParseBool(strings.TrimSpace(raw)) + if err != nil { + return false, fmt.Errorf("invalid boolean value %q", raw) + } + return parsed, nil +} + +func parseBoolOption(raw string) (bool, bool, error) { + if strings.TrimSpace(raw) == "" { + return false, false, nil + } + value, err := parseBoolValue(raw) + if err != nil { + return false, false, err + } + return value, true, nil +} + func coalesce(values ...string) string { for _, value := range values { if strings.TrimSpace(value) != "" { @@ -141,6 +191,7 @@ func newGate(cfg shimRuntimeConfig) *shim.Gate { } opts := []shim.GateOption{ shim.WithMode(mode), + shim.WithEnforcePolicyErrors(cfg.enforcePolicyErrors), } if strings.TrimSpace(cfg.eventSink) != "" { opts = append(opts, shim.WithEventSink(shim.NewNDJSONDecisionEventSink(cfg.eventSink))) @@ -214,6 +265,7 @@ func runCommandAdapter(ctx context.Context, cfg shimRuntimeConfig, args []string adapter := flags.String("adapter", "", "tool adapter name") target := flags.String("target", "", "binary or command to execute") overrideMode := flags.String("mode", "", "enforcement mode: enforced|observe") + overrideEnforcePolicyErrors := flags.String("enforce-policy-errors", "", "set to true to block command execution when policy service fails") policyURL := flags.String("policy-url", "", "base URL for datafog API (for example http://localhost:8080)") apiToken := flags.String("api-token", "", "API token for policy decisions") eventSink := flags.String("event-sink", "", "path for NDJSON decision event sink") @@ -222,15 +274,27 @@ func runCommandAdapter(ctx context.Context, cfg shimRuntimeConfig, args []string return err } - var err error - cfg, err = resolveRuntimeConfig(shimRuntimeConfig{ - policyURL: coalesce(*policyURL, cfg.policyURL), - apiToken: coalesce(*apiToken, cfg.apiToken), - mode: coalesce(*overrideMode, cfg.mode), - eventSink: coalesce(*eventSink, cfg.eventSink), - shimDir: cfg.shimDir, - sensitive: *sensitive || cfg.sensitive, - }) + 
overrideEnforcePolicyErrorsValue, overrideEnforcePolicyErrorsSet, err := parseBoolOption(*overrideEnforcePolicyErrors) + if err != nil { + return err + } + + resolveCfg := shimRuntimeConfig{ + policyURL: coalesce(*policyURL, cfg.policyURL), + apiToken: coalesce(*apiToken, cfg.apiToken), + mode: coalesce(*overrideMode, cfg.mode), + eventSink: coalesce(*eventSink, cfg.eventSink), + shimDir: cfg.shimDir, + sensitive: *sensitive || cfg.sensitive, + enforcePolicyErrors: cfg.enforcePolicyErrors, + enforcePolicyErrorsSet: cfg.enforcePolicyErrorsSet, + } + if overrideEnforcePolicyErrorsSet { + resolveCfg.enforcePolicyErrors = overrideEnforcePolicyErrorsValue + resolveCfg.enforcePolicyErrorsSet = true + } + + cfg, err = resolveRuntimeConfig(resolveCfg) if err != nil { return err } @@ -293,11 +357,17 @@ func runHooksInstall(cfg shimRuntimeConfig, argv []string) error { overrideMode := flags.String("mode", "", "override enforcement mode for this shim") overridePolicyURL := flags.String("policy-url", "", "override policy URL for this shim") overrideEventSink := flags.String("event-sink", "", "override event sink path for this shim") + overrideEnforcePolicyErrors := flags.String("enforce-policy-errors", "", "set to true to block command execution when policy service fails") shimDir := flags.String("shim-dir", "", "directory for generated shim") if err := flags.Parse(argv); err != nil { return err } + overrideEnforcePolicyErrorsValue, overrideEnforcePolicyErrorsSet, err := parseBoolOption(*overrideEnforcePolicyErrors) + if err != nil { + return err + } + args := flags.Args() if len(args) != 1 { return fmt.Errorf("hooks install expects one command name") @@ -307,6 +377,10 @@ func runHooksInstall(cfg shimRuntimeConfig, argv []string) error { installCfg.mode = coalesce(*overrideMode, cfg.mode) installCfg.policyURL = coalesce(*overridePolicyURL, cfg.policyURL) installCfg.eventSink = coalesce(*overrideEventSink, cfg.eventSink) + if overrideEnforcePolicyErrorsSet { + 
installCfg.enforcePolicyErrors = overrideEnforcePolicyErrorsValue + installCfg.enforcePolicyErrorsSet = true + } if *shimDir != "" { installCfg.shimDir = *shimDir } @@ -502,6 +576,7 @@ func installShimScript(shimBinary string, cfg shimRuntimeConfig, command, adapte string(mode), cfg.policyURL, cfg.eventSink, + cfg.enforcePolicyErrors, ) if err := os.WriteFile(shimPath, []byte(content), 0o755); err != nil { @@ -586,6 +661,10 @@ func readShimMetadata(path string) (managedShimMetadata, bool, error) { meta.PolicyURL = val case "EVENT_SINK": meta.EventSink = val + case "ENFORCE_POLICY_ERRORS": + if parsed, _, err := parseBoolOption(val); err == nil { + meta.EnforcePolicyErrors = parsed + } } } if err := scanner.Err(); err != nil { @@ -594,7 +673,7 @@ func readShimMetadata(path string) (managedShimMetadata, bool, error) { return meta, isManaged, nil } -func buildShimScript(shimBinary, command, adapter, target, mode, policyURL, eventSink string) string { +func buildShimScript(shimBinary, command, adapter, target, mode, policyURL, eventSink string, enforcePolicyErrors bool) string { lines := []string{ "#!/bin/sh", "set -eu", @@ -605,11 +684,13 @@ func buildShimScript(shimBinary, command, adapter, target, mode, policyURL, even "# DATAFOG_SHIM_MODE=" + mode, "# DATAFOG_SHIM_POLICY_URL=" + policyURL, "# DATAFOG_SHIM_EVENT_SINK=" + eventSink, + "# DATAFOG_SHIM_ENFORCE_POLICY_ERRORS=" + strconv.FormatBool(enforcePolicyErrors), "", "SHIM_BINARY=" + shQuote(shimBinary), "SHIM_MODE=" + shQuote(mode), "SHIM_POLICY_URL=" + shQuote(policyURL), "SHIM_EVENT_SINK=" + shQuote(eventSink), + "SHIM_ENFORCE_POLICY_ERRORS=" + shQuote(strconv.FormatBool(enforcePolicyErrors)), "if [ -n \"${DATAFOG_SHIM_MODE:-}\" ]; then", " SHIM_MODE=\"$DATAFOG_SHIM_MODE\"", "fi", @@ -619,6 +700,9 @@ func buildShimScript(shimBinary, command, adapter, target, mode, policyURL, even "if [ -n \"${DATAFOG_SHIM_EVENT_SINK:-}\" ]; then", " SHIM_EVENT_SINK=\"$DATAFOG_SHIM_EVENT_SINK\"", "fi", + "if [ -n 
\"${DATAFOG_SHIM_ENFORCE_POLICY_ERRORS:-}\" ]; then", + " SHIM_ENFORCE_POLICY_ERRORS=\"$DATAFOG_SHIM_ENFORCE_POLICY_ERRORS\"", + "fi", "", `exec "$SHIM_BINARY" run \`, ` --adapter "` + shellEscape(adapter) + `" \`, @@ -626,6 +710,7 @@ func buildShimScript(shimBinary, command, adapter, target, mode, policyURL, even ` --mode "$SHIM_MODE" \`, ` --policy-url "$SHIM_POLICY_URL" \`, ` --event-sink "$SHIM_EVENT_SINK" \`, + ` --enforce-policy-errors "$SHIM_ENFORCE_POLICY_ERRORS" \`, ` --api-token "${DATAFOG_SHIM_API_TOKEN:-}" \`, ` -- \`, ` "$@"`, diff --git a/cmd/datafog-shim/main_test.go b/cmd/datafog-shim/main_test.go index d32b368..1c80e70 100644 --- a/cmd/datafog-shim/main_test.go +++ b/cmd/datafog-shim/main_test.go @@ -43,9 +43,48 @@ func TestParseMode(t *testing.T) { }) } +func TestParseBoolOption(t *testing.T) { + t.Run("empty", func(t *testing.T) { + got, set, err := parseBoolOption("") + if err != nil { + t.Fatalf("expected parse success, got %v", err) + } + if set { + t.Fatal("expected unset flag for empty value") + } + if got { + t.Fatal("expected false when unset") + } + }) + t.Run("true", func(t *testing.T) { + got, set, err := parseBoolOption("true") + if err != nil { + t.Fatalf("expected parse success, got %v", err) + } + if !set || !got { + t.Fatalf("expected parsed true value, got %v (set=%v)", got, set) + } + }) + t.Run("false", func(t *testing.T) { + got, set, err := parseBoolOption("false") + if err != nil { + t.Fatalf("expected parse success, got %v", err) + } + if !set || got { + t.Fatalf("expected parsed false value, got %v (set=%v)", got, set) + } + }) + t.Run("invalid", func(t *testing.T) { + if _, _, err := parseBoolOption("notbool"); err == nil { + t.Fatal("expected parse error") + } + }) +} + func TestResolveRuntimeConfig(t *testing.T) { t.Setenv("DATAFOG_SHIM_POLICY_URL", "http://env:8080") t.Setenv("DATAFOG_SHIM_MODE", string(shim.ModeObserve)) + t.Setenv("DATAFOG_SHIM_ENFORCE_POLICY_ERRORS", "true") cfg, err := 
resolveRuntimeConfig(shimRuntimeConfig{}) if err != nil { @@ -57,9 +96,33 @@ func TestResolveRuntimeConfig(t *testing.T) { if cfg.mode != string(shim.ModeObserve) { t.Fatalf("expected observe mode, got %q", cfg.mode) } + if !cfg.enforcePolicyErrors { + t.Fatalf("expected enforce policy errors from env to be true") + } + if !cfg.enforcePolicyErrorsSet { + t.Fatalf("expected enforce-policy-errors set when env value present") + } if cfg.shimDir == "" { t.Fatal("expected shim directory fallback") } + + cfg, err = resolveRuntimeConfig(shimRuntimeConfig{policyURL: "http://flag:8080", enforcePolicyErrors: false, enforcePolicyErrorsSet: true}) + if err != nil { + t.Fatalf("expected explicit enforce-policy config to resolve, got %v", err) + } + if cfg.policyURL != "http://flag:8080" { + t.Fatalf("expected CLI policy URL, got %q", cfg.policyURL) + } + if cfg.enforcePolicyErrors { + t.Fatalf("expected explicit enforce policy errors false, got true") + } +} + +func TestResolveRuntimeConfigInvalidEnforcePolicyErrors(t *testing.T) { + t.Setenv("DATAFOG_SHIM_ENFORCE_POLICY_ERRORS", "not-bool") + if _, err := resolveRuntimeConfig(shimRuntimeConfig{}); err == nil { + t.Fatal("expected invalid enforce-policy-errors env value to fail") + } } func TestResolveTargetBinary(t *testing.T) { @@ -111,6 +174,7 @@ func TestBuildShimScript(t *testing.T) { string(shim.ModeObserve), "http://localhost:8080", "/tmp/events.ndjson", + true, ) if !strings.Contains(script, shimMarker) { t.Fatalf("script missing shim marker") @@ -121,9 +185,15 @@ func TestBuildShimScript(t *testing.T) { if !strings.Contains(script, "# DATAFOG_SHIM_TARGET=/usr/bin/git") { t.Fatalf("script missing target metadata") } + if !strings.Contains(script, "# DATAFOG_SHIM_ENFORCE_POLICY_ERRORS=true") { + t.Fatalf("script missing enforce policy errors metadata") + } if !strings.Contains(script, `--mode "$SHIM_MODE"`) { t.Fatalf("script missing runtime mode wiring") } + if !strings.Contains(script, `--enforce-policy-errors 
"$SHIM_ENFORCE_POLICY_ERRORS"`) { + t.Fatalf("script missing enforce policy errors wiring") + } } func TestInstallListAndUninstallShim(t *testing.T) { @@ -144,9 +214,11 @@ func TestInstallListAndUninstallShim(t *testing.T) { } cfg := shimRuntimeConfig{ - policyURL: "http://localhost:8080", - mode: string(shim.ModeEnforced), - shimDir: shimDir, + policyURL: "http://localhost:8080", + mode: string(shim.ModeEnforced), + shimDir: shimDir, + enforcePolicyErrors: true, + enforcePolicyErrorsSet: true, } shimPath, err := installShimScript(fakeShimBinary, cfg, "git", "git", targetBinary, false) @@ -171,6 +243,9 @@ func TestInstallListAndUninstallShim(t *testing.T) { if found.Adapter != "vcs" { t.Fatalf("expected adapter vcs, got %q", found.Adapter) } + if found.EnforcePolicyErrors != true { + t.Fatalf("expected enforce policy errors metadata true, got %v", found.EnforcePolicyErrors) + } list, err := listManagedShims(shimDir) if err != nil { @@ -194,6 +269,58 @@ func TestInstallListAndUninstallShim(t *testing.T) { } } +func TestHooksInstallHonorsEnforcePolicyErrorsOverride(t *testing.T) { + root := t.TempDir() + shimDir := filepath.Join(root, "shims") + targetDir := filepath.Join(root, "targets") + if err := os.MkdirAll(targetDir, 0o755); err != nil { + t.Fatalf("mkdir targets: %v", err) + } + + targetBinary := filepath.Join(targetDir, "git") + if err := os.WriteFile(targetBinary, []byte(""), 0o755); err != nil { + t.Fatalf("write target binary: %v", err) + } + fakeShimBinary := filepath.Join(root, "datafog-shim") + if err := os.WriteFile(fakeShimBinary, []byte("#!/bin/sh\necho shim\n"), 0o755); err != nil { + t.Fatalf("write shim binary: %v", err) + } + + baseCfg := shimRuntimeConfig{ + policyURL: "http://localhost:8080", + mode: string(shim.ModeObserve), + shimDir: shimDir, + enforcePolicyErrors: true, + enforcePolicyErrorsSet: true, + } + + if err := runHooksInstall(baseCfg, []string{ + "--target", + targetBinary, + "--enforce-policy-errors", + "false", + "git", + }); err 
!= nil { + t.Fatalf("runHooksInstall failed: %v", err) + } + + shimPath := shimScriptPath(shimDir, "git") + found, managed, err := readShimMetadata(shimPath) + if err != nil { + t.Fatalf("read metadata: %v", err) + } + if !managed { + t.Fatal("expected managed shim") + } + if found.EnforcePolicyErrors { + t.Fatalf("expected override to disable enforce policy errors, got true") + } + + if err := os.Remove(shimPath); err != nil { + t.Fatalf("remove shim path: %v", err) + } +} + func TestAdapterResolution(t *testing.T) { if adapter := resolveAdapter("", "/usr/bin/git"); adapter != "vcs" { t.Fatalf("expected git to resolve to vcs, got %q", adapter) diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index fdf51fb..05eb451 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -31,7 +31,7 @@ For enforcement: CLI/tool action → `datafog-shim` policy gate → `POST /v1/de - `/health` and `/v1/policy/version` do not mutate state. - No secrets are logged or returned in API responses. - Decision side effects are request-scoped and serialized into receipts before returning a `decide` response. -- If a request includes idempotency keys, repeated requests must return identical status/body or a conflict error. +- If a request includes idempotency keys, repeated requests must return identical status/body or a conflict error within a live process; idempotency caches are in-memory and clear on restart. - Unsupported or unauthenticated requests fail closed (`401`, `4xx`, or `405`) before any enforcement action. ## Details Live Elsewhere diff --git a/docs/DATA.md b/docs/DATA.md index 511c291..b961c64 100644 --- a/docs/DATA.md +++ b/docs/DATA.md @@ -33,7 +33,9 @@ No schema migration framework exists currently; backfills are manual and should - Receipt IDs and action/input hashes must remain consistent for auditability. - Receipt reads/writes are append-only (`Save` appends a JSON line and fsyncs). 
-- On startup, the service loads existing receipts into memory; duplicate IDs are naturally coalesced by key in-memory map. +- On startup, valid existing receipts are loaded into memory and duplicate receipt IDs are coalesced by key in the in-memory map. +- Malformed existing receipt lines are skipped so the service can continue if one line is corrupted. +- Idempotency replay caches are in-memory and are not persisted across restarts. - Policy validation runs at startup and rejects invalid schemas before serving traffic. ## Sensitive Data Notes diff --git a/docs/contracts/datafog-api-contract.md b/docs/contracts/datafog-api-contract.md index 5173635..1543e09 100644 --- a/docs/contracts/datafog-api-contract.md +++ b/docs/contracts/datafog-api-contract.md @@ -103,16 +103,23 @@ Returns coarse-grained service telemetry for operations and routing. "/v1/decide": 3, "/_not_found": 3 }, + "by_path_avg_latency_ms": { + "/health": 12.5, + "/v1/scan": 18.2, + "/v1/decide": 22.8, + "/_not_found": 0.9 + }, "by_method": { "GET": 14, "POST": 28 }, + "avg_latency_ms": 11.2, "started_at": "RFC3339 timestamp", "uptime_seconds": 12.34 } ``` -`by_status`, `by_path`, and `by_method` include counters for completed requests observed before each `/metrics` call. `/metrics` request details appear on subsequent polling. +`by_status`, `by_path`, `by_method`, `avg_latency_ms`, and `by_path_avg_latency_ms` include metrics for completed requests observed before each `/metrics` call. `/metrics` request details appear on subsequent polling; latency fields are reported in milliseconds. ### `POST /v1/scan` @@ -340,7 +347,7 @@ Query params: - `after` (RFC3339 timestamp) - `before` (RFC3339 timestamp) - `decision` (`allow|transform|allow_with_redaction|deny`) -- `adapter` (tool/adapter filter, e.g. 
`claude`, `codex`) +- `adapter` (tool/adapter filter, case-insensitive; `vcs` and `claude` are canonicalized forms) ```json { @@ -375,6 +382,7 @@ If no events are configured or none match, return `{"events":[],"total":0}`. - Supported endpoints: `POST /v1/scan`, `POST /v1/decide`, `POST /v1/transform`, `POST /v1/anonymize`. - Replaying the same idempotency key and identical semantic payload returns the same status and body. - Reusing a key with different payloads returns `409` and `code: idempotency_conflict`. +- Idempotency keys are stored in server memory; restarting the API process clears request deduplication history. ## Optional demo endpoints (if enabled) @@ -388,3 +396,5 @@ The following routes are only available when the server is started with `DATAFOG - `GET /demo/sandbox` These return JSON payloads for execution and file operations and are intended for documentation/demo purposes only. + +- `POST /demo/seed` is a documentation helper route that bypasses policy enforcement and should not be used for production workflows. 
diff --git a/internal/receipts/store.go b/internal/receipts/store.go index dbd7f00..d8e453b 100644 --- a/internal/receipts/store.go +++ b/internal/receipts/store.go @@ -155,6 +155,8 @@ func (s *ReceiptStore) loadExistingReceipts() error { scanner := bufio.NewScanner(f) scanner.Buffer(make([]byte, 64*1024), maxReceiptLineBytes) + + badLines := 0 for scanner.Scan() { line := strings.TrimSpace(scanner.Text()) if line == "" { @@ -162,7 +164,8 @@ func (s *ReceiptStore) loadExistingReceipts() error { } var receipt models.Receipt if err := json.Unmarshal([]byte(line), &receipt); err != nil { - return fmt.Errorf("decode existing receipt: %w", err) + badLines++ + continue } s.receipts[receipt.ReceiptID] = receipt s.entryCount++ @@ -170,6 +173,10 @@ func (s *ReceiptStore) loadExistingReceipts() error { if err := scanner.Err(); err != nil { return err } + + if badLines > 0 && len(s.receipts) == 0 { + return fmt.Errorf("decode existing receipt: no valid receipt records found in %s (bad lines: %d)", s.filePath, badLines) + } return nil } diff --git a/internal/receipts/store_test.go b/internal/receipts/store_test.go index 8cf85b2..4fc0dda 100644 --- a/internal/receipts/store_test.go +++ b/internal/receipts/store_test.go @@ -72,6 +72,33 @@ func TestReceiptStoreLoadsExistingReceipts(t *testing.T) { } } +func TestReceiptStoreSkipsCorruptReceiptLines(t *testing.T) { + path := t.TempDir() + "/receipts.jsonl" + good := models.Receipt{ + ReceiptID: "receipt-good", + PolicyID: "policy-1", + PolicyVersion: "v1", + Decision: models.DecisionAllow, + } + data, err := json.Marshal(good) + if err != nil { + t.Fatalf("marshal good receipt failed: %v", err) + } + if err := os.WriteFile(path, append([]byte("{\n"), append(data, '\n')...), 0o644); err != nil { + t.Fatalf("seed file write failed: %v", err) + } + + store, err := NewReceiptStore(path) + if err != nil { + t.Fatalf("expected corrupt+good receipts to load, got %v", err) + } + if got, ok := store.Get("receipt-good"); !ok { + 
t.Fatalf("expected good receipt loaded") + } else if got.ReceiptID != "receipt-good" { + t.Fatalf("expected loaded receipt id, got %q", got.ReceiptID) + } +} + func TestReceiptStoreRejectsCorruptReceiptLine(t *testing.T) { path := t.TempDir() + "/receipts.jsonl" if err := os.WriteFile(path, []byte("{\n"), 0o644); err != nil { diff --git a/internal/server/server.go b/internal/server/server.go index f10b6f1..fb1e64c 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -18,6 +18,7 @@ import ( "sync" "time" + "github.com/datafog/datafog-api/internal/adapters" "github.com/datafog/datafog-api/internal/models" "github.com/datafog/datafog-api/internal/policy" "github.com/datafog/datafog-api/internal/receipts" @@ -707,8 +708,12 @@ func (s *Server) handleEvents(w http.ResponseWriter, r *http.Request) { if decision := r.URL.Query().Get("decision"); decision != "" { q.Decision = decision } - if adapter := r.URL.Query().Get("adapter"); adapter != "" { - q.Adapter = adapter + if adapter := strings.TrimSpace(r.URL.Query().Get("adapter")); adapter != "" { + if canonical, ok := adapters.Canonical(strings.ToLower(adapter)); ok { + q.Adapter = canonical + } else { + q.Adapter = strings.ToLower(adapter) + } } if limitStr := r.URL.Query().Get("limit"); limitStr != "" { if n, err := strconv.Atoi(limitStr); err == nil && n > 0 && n <= 1000 { diff --git a/internal/server/server_test.go b/internal/server/server_test.go index cf25b91..5181c6f 100644 --- a/internal/server/server_test.go +++ b/internal/server/server_test.go @@ -11,6 +11,7 @@ import ( "github.com/datafog/datafog-api/internal/models" "github.com/datafog/datafog-api/internal/receipts" + "github.com/datafog/datafog-api/internal/shim" ) func testPolicy() models.Policy { @@ -44,6 +45,30 @@ func makeServerWithTokenAndRateLimit(t *testing.T, apiToken string, rateLimitRPS return &http.Server{Handler: h.Handler()} } +type fakeEventReader struct { + events []shim.DecisionEvent +} + +func (r fakeEventReader) Query(q 
shim.EventQuery) ([]shim.DecisionEvent, error) { + if q.Limit <= 0 { + q.Limit = 1000 + } + filtered := make([]shim.DecisionEvent, 0, len(r.events)) + for _, event := range r.events { + if q.Adapter != "" && event.Tool != q.Adapter { + continue + } + if q.Decision != "" && string(event.Decision) != q.Decision { + continue + } + filtered = append(filtered, event) + if len(filtered) >= q.Limit { + break + } + } + return filtered, nil +} + func TestHealthEndpoint(t *testing.T) { server := makeServer(t) req := httptest.NewRequest(http.MethodGet, "/health", nil) @@ -909,6 +934,38 @@ func TestMetricsMethodNotAllowed(t *testing.T) { assertJSONError(t, resp, http.StatusMethodNotAllowed, "method_not_allowed") } +func TestEventsAdapterFilterCanonicalized(t *testing.T) { + s, err := receipts.NewReceiptStore(t.TempDir() + "/receipts.jsonl") + if err != nil { + t.Fatalf("new store: %v", err) + } + h := New(testPolicy(), s, nil, "", 0) + h.SetEventReader(fakeEventReader{events: []shim.DecisionEvent{ + {Tool: "claude", Decision: string(models.DecisionAllow)}, + {Tool: "vcs", Decision: string(models.DecisionDeny)}, + }}) + + req := httptest.NewRequest(http.MethodGet, "/v1/events?adapter=CLAUDE", nil) + resp := httptest.NewRecorder() + h.Handler().ServeHTTP(resp, req) + if resp.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", resp.Code) + } + var got struct { + Events []shim.DecisionEvent `json:"events"` + Total int `json:"total"` + } + if err := json.NewDecoder(resp.Body).Decode(&got); err != nil { + t.Fatalf("decode failed: %v", err) + } + if got.Total != 1 { + t.Fatalf("expected one canonicalized adapter match, got %d", got.Total) + } + if len(got.Events) != 1 || got.Events[0].Tool != "claude" { + t.Fatalf("expected canonicalized filter to match claude, got %#v", got.Events) + } +} + func TestInvalidJSONHandling(t *testing.T) { server := makeServer(t) diff --git a/internal/shim/enforcer.go b/internal/shim/enforcer.go index 17de779..c55e6e2 100644 --- 
a/internal/shim/enforcer.go +++ b/internal/shim/enforcer.go @@ -50,23 +50,31 @@ func WithEventSink(sink DecisionEventSink) GateOption { } } +func WithEnforcePolicyErrors(enabled bool) GateOption { + return func(g *Gate) { + g.EnforcePolicyErrors = enabled + } +} + type Gate struct { - Client DecisionClient - Runner CommandRunner - Reader FileReader - Writer FileWriter - Mode EnforcementMode - EventSink DecisionEventSink + Client DecisionClient + Runner CommandRunner + Reader FileReader + Writer FileWriter + Mode EnforcementMode + EventSink DecisionEventSink + EnforcePolicyErrors bool } func NewGate(client DecisionClient, opts ...GateOption) *Gate { g := &Gate{ - Client: client, - Runner: &osCommandRunner{}, - Reader: &osFileReader{}, - Writer: &osFileWriter{}, - Mode: ModeEnforced, - EventSink: noopEventSink{}, + Client: client, + Runner: &osCommandRunner{}, + Reader: &osFileReader{}, + Writer: &osFileWriter{}, + Mode: ModeEnforced, + EventSink: noopEventSink{}, + EnforcePolicyErrors: false, } for _, opt := range opts { if opt != nil { @@ -139,10 +147,14 @@ func (r *Gate) shouldAllow(decision models.Decision) bool { return r.Mode == ModeObserve } +func (r *Gate) shouldBlockOnPolicyError() bool { + return r.Mode == ModeEnforced || r.EnforcePolicyErrors +} + func (r *Gate) executeRequest(ctx context.Context, req models.DecideRequest, run func(context.Context) ([]byte, error)) (models.DecideResponse, []byte, error) { result, err := r.Check(ctx, req) if err != nil { - if r.Mode == ModeEnforced { + if r.shouldBlockOnPolicyError() { r.recordDecisionEvent(req, result, false, err) return result, nil, err } @@ -231,7 +243,7 @@ func (r *Gate) WriteFile(ctx context.Context, path string, data []byte, perm fs. 
req := r.readRequest(action, text, findings) result, err := r.Check(ctx, req) if err != nil { - if r.Mode == ModeEnforced { + if r.shouldBlockOnPolicyError() { r.recordDecisionEvent(req, result, false, err) return result, err } diff --git a/internal/shim/enforcer_test.go b/internal/shim/enforcer_test.go index 1df77ae..80b8969 100644 --- a/internal/shim/enforcer_test.go +++ b/internal/shim/enforcer_test.go @@ -209,6 +209,30 @@ func TestShellExecutionPolicyErrorPassesInObserveMode(t *testing.T) { } } +func TestShellExecutionPolicyErrorBlocksInObserveModeWhenConfigured(t *testing.T) { + decider := &fakeDecisionClient{ + err: errors.New("policy unavailable"), + } + runner := &fakeCommandRunner{out: []byte("ok\n")} + recorder := &fakeEventRecorder{} + interceptor := NewGate(decider, WithMode(ModeObserve), WithEnforcePolicyErrors(true), WithEventSink(recorder)) + interceptor.Runner = runner + + _, _, err := interceptor.ExecuteShell(context.Background(), "ls", nil, "", nil, false) + if err == nil { + t.Fatalf("expected policy error to block in observe mode when configured") + } + if runner.called { + t.Fatalf("expected command runner to be blocked on policy error") + } + if len(recorder.events) != 1 || recorder.events[0].Mode != string(ModeObserve) { + t.Fatalf("expected one observe-mode event, got %#v", recorder.events) + } + if recorder.events[0].Allowed != false { + t.Fatalf("expected blocked event, got %#v", recorder.events[0]) + } +} + func TestCommandAdapterExecution(t *testing.T) { decider := &fakeDecisionClient{ response: models.DecideResponse{