"}'
+ . "\nDo not wrap the JSON in markdown or prose.";
+
+ /**
+ * Inject the collaborators used by grade(): the request log repository,
+ * the provider configuration repository, the provider resolver and a logger.
+ */
+ public function __construct(
+ private readonly RequestLogRepository $logRepository,
+ private readonly ProviderConfigurationRepository $configurationRepository,
+ private readonly ProviderResolver $resolver,
+ private readonly LoggerInterface $logger,
+ ) {}
+
+    /**
+     * Grade the request log row identified by $logUid.
+     *
+     * Flow: load the row and its provider configuration, resolve the judge
+     * configuration, send prompt + response to the judge, then store the parsed
+     * score/label/reason on the row.
+     *
+     * Outcomes:
+     *  - Returns silently when $logUid is not positive, the row does not exist,
+     *    or the primary configuration is gone / has grading disabled.
+     *  - Marks the row as grade-failed when the judge configuration is unusable,
+     *    prompt or response content is empty, the judge provider lacks the
+     *    conversation capability, the judge call is unsuccessful, or its output
+     *    cannot be parsed.
+     *  - Any Throwable raised along the way is logged and recorded on the row.
+     *
+     * Idempotent in spirit: callers should ensure they only invoke this for
+     * rows in grade_status='pending'. The method itself does not double-check —
+     * the shutdown handler holds the freshly inserted uid and the scheduler
+     * filters by status, so a re-entry would only happen on a true race.
+     *
+     * @param int $logUid Uid of the request log row to grade.
+     */
+    public function grade(int $logUid): void
+    {
+        if ($logUid <= 0) {
+            return;
+        }
+
+        $start = hrtime(true);
+        try {
+            $row = $this->logRepository->findByUid($logUid);
+            if ($row === null) {
+                return;
+            }
+
+            $primaryConfig = $this->configurationRepository->findByUid((int)($row['configuration_uid'] ?? 0));
+            if ($primaryConfig === null || !$primaryConfig->gradingEnabled) {
+                // Config gone or grading turned off between insert and grade — nothing to do.
+                return;
+            }
+
+            $judgeUid = $primaryConfig->judgeConfigurationUid;
+            if ($judgeUid <= 0 || $judgeUid === $primaryConfig->uid) {
+                $this->logRepository->markGradeFailed(
+                    $logUid,
+                    'Judge configuration is missing or points at the same configuration.',
+                );
+                return;
+            }
+
+            $prompt = (string)($row['request_prompt'] ?? '');
+            $response = (string)($row['response_content'] ?? '');
+            if ($prompt === '' || $response === '') {
+                $this->logRepository->markGradeFailed(
+                    $logUid,
+                    'Prompt or response content is empty (likely redacted) — cannot grade.',
+                );
+                return;
+            }
+
+            $judgeResolved = $this->resolver->resolveForCapability(ConversationCapableInterface::class, $judgeUid);
+            $judgeProvider = $judgeResolved->manifest->getInstance();
+            if (!$judgeProvider instanceof ConversationCapableInterface) {
+                $this->logRepository->markGradeFailed(
+                    $logUid,
+                    'Judge provider does not support conversation capability.',
+                );
+                return;
+            }
+
+            $judgeRequest = $this->buildJudgeRequest($primaryConfig, $judgeResolved->configuration, $prompt, $response, $logUid);
+            $judgeResponse = $judgeProvider->processConversationRequest($judgeRequest);
+
+            // The judge call bypasses the middleware pipeline (would otherwise create
+            // a DI cycle and a duplicate request-log row), so CostTrackingMiddleware
+            // never sees it. Roll the spend into the judge configuration's total_cost
+            // here instead. This is done as soon as the call returns so a paid-for but
+            // unparseable response is still accounted for.
+            $this->configurationRepository->updateTotalCost($judgeUid, $judgeResponse->usage->cost);
+
+            if (!$judgeResponse->isSuccessful()) {
+                $error = $judgeResponse->errors[0] ?? 'Judge returned an unsuccessful response.';
+                $this->logRepository->markGradeFailed($logUid, $error);
+                return;
+            }
+
+            $parsed = $this->parseJudgeOutput($judgeResponse->content);
+            if ($parsed === null) {
+                $this->logRepository->markGradeFailed(
+                    $logUid,
+                    'Judge response could not be parsed as JSON: ' . mb_substr($judgeResponse->content, 0, 200),
+                );
+                return;
+            }
+
+            // Duration covers the whole grading run (lookups + judge call), in milliseconds.
+            $durationMs = (int)((hrtime(true) - $start) / 1_000_000);
+            $this->logRepository->updateGrade(
+                $logUid,
+                $parsed['score'],
+                $parsed['label'],
+                $parsed['reason'],
+                $judgeResponse->usage->modelUsed ?: '',
+                $judgeResponse->usage->cost,
+                $durationMs,
+            );
+        } catch (\Throwable $e) {
+            $this->logger->warning('AiM grading failed for log uid ' . $logUid . ': ' . $e->getMessage());
+            try {
+                $this->logRepository->markGradeFailed($logUid, $e->getMessage());
+            } catch (\Throwable $markError) {
+                // Recording the failure can itself fail (e.g. the same database outage
+                // that caused $e). This method is invoked from a shutdown handler, so the
+                // secondary error is logged instead of being allowed to escape; the row
+                // simply keeps its current grade status.
+                $this->logger->error(
+                    'AiM could not mark log uid ' . $logUid . ' as grade-failed: ' . $markError->getMessage(),
+                );
+            }
+        }
+    }
+
+    /**
+     * Assemble the conversation request that is sent to the judge model.
+     *
+     * The system prompt is the primary configuration's rubric (or a built-in
+     * default when the rubric is blank) followed by the JSON output instruction.
+     * The graded prompt and response travel together in a single user message.
+     *
+     * @param ProviderConfiguration $primaryConfig Configuration whose response is graded; supplies the rubric.
+     * @param ProviderConfiguration $judgeConfig   Configuration the judge request is issued against.
+     * @param string                $prompt        Prompt text of the graded request.
+     * @param string                $response      Response text to be scored.
+     * @param int                   $logUid        Uid of the graded log row, passed along as metadata.
+     */
+    private function buildJudgeRequest(
+        ProviderConfiguration $primaryConfig,
+        ProviderConfiguration $judgeConfig,
+        string $prompt,
+        string $response,
+        int $logUid,
+    ): ConversationRequest {
+        $customRubric = trim($primaryConfig->gradingRubric);
+        $instructions = $customRubric !== ''
+            ? $customRubric
+            : 'Evaluate the response for factual accuracy and relevance to the user prompt.';
+
+        $judgeInput = new UserMessage("Prompt:\n{$prompt}\n\nResponse:\n{$response}");
+
+        // Marks the request as an internal grading call and links it to the graded row.
+        $metadata = [
+            '_aim_grading' => true,
+            'graded_log_uid' => $logUid,
+            'extension' => 'aim',
+        ];
+
+        return new ConversationRequest(
+            configuration: $judgeConfig,
+            messages: [$judgeInput],
+            systemPrompt: $instructions . self::JSON_INSTRUCTION,
+            responseFormat: ResponseFormat::json(),
+            maxTokens: 300,
+            temperature: 0.0,
+            metadata: $metadata,
+        );
+    }
+
+    /**
+     * Parse and validate the judge's JSON output.
+     *
+     * Returns null when no JSON object can be extracted, the JSON is invalid or
+     * nested deeper than 8 levels, `score` or `label` is missing/null, or `score`
+     * is not a number (int, float or numeric string). A non-numeric score is
+     * rejected rather than cast, because a cast would silently record a
+     * meaningless 0.0 or 1.0 grade.
+     *
+     * On success the score is clamped to [0.0, 1.0], the label falls back to
+     * GradeLabel::fromScore() when the judge's label is not a known value, and
+     * the reason is trimmed and capped at 500 characters.
+     *
+     * @return array{score: float, label: GradeLabel, reason: string}|null
+     */
+    private function parseJudgeOutput(string $raw): ?array
+    {
+        $json = $this->extractJsonObject($raw);
+        if ($json === null) {
+            return null;
+        }
+        try {
+            $decoded = json_decode($json, true, 8, JSON_THROW_ON_ERROR);
+        } catch (\JsonException) {
+            return null;
+        }
+        if (!is_array($decoded) || !isset($decoded['score'], $decoded['label'])) {
+            return null;
+        }
+
+        $rawScore = $decoded['score'];
+        $isNumber = is_int($rawScore)
+            || is_float($rawScore)
+            || (is_string($rawScore) && is_numeric(trim($rawScore)));
+        if (!$isNumber) {
+            return null;
+        }
+        $score = max(0.0, min(1.0, (float)(is_string($rawScore) ? trim($rawScore) : $rawScore)));
+
+        // Only a string can name a label; anything else is treated as "unrecognized"
+        // and derived from the score, without an array-to-string conversion.
+        $rawLabel = $decoded['label'];
+        $label = (is_string($rawLabel) ? GradeLabel::tryFrom(strtolower(trim($rawLabel))) : null)
+            ?? GradeLabel::fromScore($score);
+
+        $rawReason = $decoded['reason'] ?? '';
+        $reason = is_scalar($rawReason) ? trim((string)$rawReason) : '';
+        if (mb_strlen($reason) > 500) {
+            $reason = mb_substr($reason, 0, 497) . '...';
+        }
+
+        return ['score' => $score, 'label' => $label, 'reason' => $reason];
+    }
+
+    /**
+     * Strip code fences and locate the first balanced JSON object in the string.
+     * Tolerates the common failure mode of LLMs wrapping JSON in markdown or
+     * adding prose after the object.
+     *
+     * Braces inside JSON string literals are ignored while balancing. When no
+     * balanced object is found, the span from the first "{" to the last "}" is
+     * returned instead so the caller's json_decode() decides.
+     *
+     * @return string|null Candidate JSON text, or null when the input is empty
+     *                     or contains no "{ ... }" span.
+     */
+    private function extractJsonObject(string $raw): ?string
+    {
+        $raw = trim($raw);
+        if ($raw === '') {
+            return null;
+        }
+        // Strip ```json ... ``` fence if present
+        if (str_starts_with($raw, '```')) {
+            $raw = preg_replace('/^```(?:json)?\s*\n?|\n?```$/m', '', $raw) ?? $raw;
+            $raw = trim($raw);
+        }
+        $start = strpos($raw, '{');
+        if ($start === false) {
+            return null;
+        }
+
+        // Byte-wise scan is UTF-8 safe: the structural characters are ASCII and
+        // never occur inside a multi-byte sequence.
+        $depth = 0;
+        $inString = false;
+        $escaped = false;
+        $length = strlen($raw);
+        for ($i = $start; $i < $length; $i++) {
+            $char = $raw[$i];
+            if ($inString) {
+                if ($escaped) {
+                    $escaped = false;
+                } elseif ($char === '\\') {
+                    $escaped = true;
+                } elseif ($char === '"') {
+                    $inString = false;
+                }
+                continue;
+            }
+            if ($char === '"') {
+                $inString = true;
+            } elseif ($char === '{') {
+                $depth++;
+            } elseif ($char === '}') {
+                $depth--;
+                if ($depth === 0) {
+                    return substr($raw, $start, $i - $start + 1);
+                }
+            }
+        }
+
+        // Unbalanced input: fall back to the widest "{ ... }" span.
+        $end = strrpos($raw, '}');
+        if ($end === false || $end <= $start) {
+            return null;
+        }
+        return substr($raw, $start, $end - $start + 1);
+    }
+}
diff --git a/Configuration/TCA/tx_aim_configuration.php b/Configuration/TCA/tx_aim_configuration.php
index 4a46e06..0fd2bed 100644
--- a/Configuration/TCA/tx_aim_configuration.php
+++ b/Configuration/TCA/tx_aim_configuration.php
@@ -32,7 +32,10 @@
--palette--;;cost,
--div--;LLL:EXT:core/Resources/Private/Language/Form/locallang_tabs.xlf:access,
--palette--;;access,
- --palette--;;governance',
+ --palette--;;governance,
+ --div--;LLL:EXT:aim/Resources/Private/Language/locallang_tca.xlf:tx_aim_configuration.tab.grading,
+ --palette--;;grading,
+ grading_rubric',
],
],
'palettes' => [
@@ -56,6 +59,10 @@
'label' => 'LLL:EXT:frontend/Resources/Private/Language/locallang_tca.xlf:pages.palettes.access',
'showitem' => 'disabled',
],
+ 'grading' => [
+ 'label' => 'LLL:EXT:aim/Resources/Private/Language/locallang_tca.xlf:tx_aim_configuration.palette.grading.label',
+ 'showitem' => 'grading_enabled, --linebreak--, judge_configuration_uid',
+ ],
],
'columns' => [
'ai_provider' => [
@@ -209,5 +216,40 @@
],
],
],
+ 'grading_enabled' => [
+ 'label' => 'LLL:EXT:aim/Resources/Private/Language/locallang_tca.xlf:tx_aim_configuration.columns.grading_enabled.label',
+ 'description' => 'LLL:EXT:aim/Resources/Private/Language/locallang_tca.xlf:tx_aim_configuration.columns.grading_enabled.description',
+ 'config' => [
+ 'type' => 'check',
+ 'renderType' => 'checkboxToggle',
+ 'default' => 0,
+ ],
+ ],
+ 'judge_configuration_uid' => [
+ 'label' => 'LLL:EXT:aim/Resources/Private/Language/locallang_tca.xlf:tx_aim_configuration.columns.judge_configuration_uid.label',
+ 'description' => 'LLL:EXT:aim/Resources/Private/Language/locallang_tca.xlf:tx_aim_configuration.columns.judge_configuration_uid.description',
+ 'displayCond' => 'FIELD:grading_enabled:REQ:true',
+ 'config' => [
+ 'type' => 'select',
+ 'renderType' => 'selectSingle',
+ 'foreign_table' => 'tx_aim_configuration',
+ 'foreign_table_where' => 'AND tx_aim_configuration.uid != ###THIS_UID### AND tx_aim_configuration.disabled = 0 ORDER BY tx_aim_configuration.title',
+ 'items' => [
+ ['label' => '', 'value' => 0],
+ ],
+ 'default' => 0,
+ ],
+ ],
+ 'grading_rubric' => [
+ 'label' => 'LLL:EXT:aim/Resources/Private/Language/locallang_tca.xlf:tx_aim_configuration.columns.grading_rubric.label',
+ 'description' => 'LLL:EXT:aim/Resources/Private/Language/locallang_tca.xlf:tx_aim_configuration.columns.grading_rubric.description',
+ 'displayCond' => 'FIELD:grading_enabled:REQ:true',
+ 'config' => [
+ 'type' => 'text',
+ 'rows' => 6,
+ 'cols' => 40,
+ 'placeholder' => 'LLL:EXT:aim/Resources/Private/Language/locallang_tca.xlf:tx_aim_configuration.columns.grading_rubric.placeholder',
+ ],
+ ],
],
];
diff --git a/Documentation/Introduction.md b/Documentation/Introduction.md
index 5d4036e..14c8ada 100644
--- a/Documentation/Introduction.md
+++ b/Documentation/Introduction.md
@@ -112,6 +112,8 @@ Every extension using AiM now has AI capabilities. No further configuration need
AiM analyzes each prompt's complexity before sending it to an AI provider. A simple "What is PHP?" doesn't need GPT-4.1. A smaller, cheaper model handles it just fine. AiM learns from your request history which models work well for which types of questions and automatically routes to the most cost-effective option.
+If you also enable [response quality grading](#response-quality-grading), routing gets smarter still: a cheaper model is only chosen if its past answers were actually graded as good, not merely because they "didn't error". A model that runs cheaply but produces weak responses is excluded from the downgrade candidates. Until enough graded requests exist for a model, routing falls back to cost and reliability alone, so nothing changes for setups that don't use grading.
+
This happens transparently. Your extensions don't need to change anything.
### Auto model switching
@@ -185,6 +187,7 @@ Every AI request is tracked in the **AiM > Request Log** module:
- **Which model answered**: requested model vs. actually used model
- **How much it cost**: token counts (prompt, completion, cached, reasoning) and calculated cost
- **How complex it was**: AiM's complexity classification (simple/moderate/complex) with the scoring reason
+- **How good it was**: when grading is enabled, an LLM-as-a-judge quality score, label, and reason
- **How long it took**: wall-clock duration in milliseconds
- **Who asked**: the backend username is displayed for each request, so you can see which user triggered it. Automated/CLI requests show no user.
- **Which extension**: the calling extension key is shown per request
@@ -194,6 +197,16 @@ Filter by provider, extension, request type, or success/failure. Statistics dash

+### Response quality grading
+
+How good are the AI responses your site produces? AiM can answer that automatically. Enable **LLM grading** on any provider configuration and AiM scores each response with a second AI model acting as an impartial judge ("LLM-as-a-judge").
+
+You write the rubric ("evaluate factual accuracy and relevance", "check the tone is friendly and professional", ...) and pick which configuration acts as the judge, typically a cheaper model. After each response is delivered, AiM asks the judge to score it and records a grade (poor / fair / good / excellent), a 0–1 score, and a one-sentence reason on the request log row.
+
+Grading runs *after* the response reaches the user, so it never slows anything down. It applies to text and conversation requests, and only when full logging is active, since the judge needs to see the content it is scoring. Grading is delivered by a shutdown handler on the live request, with a scheduler task (`aim:grade-pending`) as a safety net for anything it misses.
+
+This turns the request log into a quality dashboard: spot which models or prompts produce weak answers, compare providers on real output, and catch quality regressions before your editors do.
+
### Provider verification
Click the verify button next to any provider configuration to test the connection. See "connected" or "disconnected" with the exact error message. Results are persisted so you see the last check status on every page load.
diff --git a/README.md b/README.md
index f3066cc..bfd1875 100644
--- a/README.md
+++ b/README.md
@@ -40,15 +40,16 @@ A few lines to add AI to any TYPO3 extension. No API keys in your code, no provi
- Budget limits and rate limiting per user (including admins as a safety net)
- Privacy levels (standard / reduced / none) per provider
- Provider group restrictions and capability permissions via native TYPO3 mechanisms
+- LLM grading: score response quality with a second model acting as a judge
**Under the hood:**
- Zero provider dependencies. Install Symfony AI bridge packages as needed.
- Auto-discovery of installed bridges (OpenAI, Anthropic, Gemini, Mistral, Ollama, etc.)
- Capability-based routing with model-level awareness
- Auto model switch: one config covers all capabilities
-- Smart routing: routes simple prompts to cheaper models based on historical cost data
+- Smart routing: routes simple prompts to cheaper models based on historical cost, reliability, and (with grading) quality data
- Fallback chains: automatic retry with alternative providers on failure
-- 8-layer middleware pipeline: retry, access control, smart routing, capability validation, logging, cost tracking, events, dispatch
+- 9-layer middleware pipeline: retry, access control, smart routing, capability validation, grading, logging, cost tracking, events, dispatch
## Installation
@@ -77,6 +78,37 @@ After installation, create a provider configuration in the backend (Admin Tools
> **Local providers (Ollama, LM Studio):** The *API Key* field doubles as the endpoint URL. Enter `http://localhost:11434` (Ollama) or `http://localhost:1234` (LM Studio) instead of a key. The available models are then fetched live from that endpoint.
+## Trying AiM from the command line
+
+Once a provider configuration exists, you can fire requests without writing an extension first. The `aim:test` command sends a one-off request through the full pipeline and reports the response, model used, token usage, cost, timing, and whether a request-log row was written:
+
+```bash
+# Text generation (default capability)
+vendor/bin/typo3 aim:test text --prompt "Write a haiku about TYPO3"
+
+# Conversation, against a specific provider
+vendor/bin/typo3 aim:test conversation -p "anthropic:*" --prompt "Explain dependency injection"
+
+# Translation
+vendor/bin/typo3 aim:test translate --prompt "Hello world" --from English --to German
+
+# Embeddings
+vendor/bin/typo3 aim:test embed --prompt "TYPO3 is an open-source CMS"
+```
+
+The capability is a positional argument (`text`, `conversation`, `translate`, or `embed`; defaults to `text`). Options:
+
+| Option | Purpose |
+|---|---|
+| `--prompt` | The prompt / text to send |
+| `--provider` / `-p` | Provider notation (`openai:gpt-4o`, `anthropic:*`); defaults to the configured default |
+| `--site` | Resolve the provider from a site's `settings.yaml` instead of the database; takes precedence over `--provider` |
+| `--system-prompt` | Optional system prompt |
+| `--max-tokens` | Token limit for the response |
+| `--from` / `--to` | Source / target language (translate only) |
+
+Because it runs through the real pipeline, every call also lands in the request log, which makes it a quick way to see logging, cost tracking, smart routing, and grading in action before integrating the API into your own code.
+
## Usage
### Tier 1: Proxy (recommended)
@@ -437,6 +469,14 @@ The `SmartRoutingMiddleware` classifies prompt complexity using language-agnosti
Classification is logged per request (`complexity_score`, `complexity_label`, `complexity_reason`). When a cheaper model has proven reliable for simple prompts (based on historical request log data with minimum 10 requests and 90%+ success rate), the middleware automatically downgrades.
+### Quality gate
+
+"Reliable" on its own only means *the API call didn't error*. A cheap model can succeed every time while producing weak answers. When [LLM grading](#llm-grading) is enabled, smart routing also consults the recorded `grade_score`: a cheaper model is only chosen if its graded responses for that request type average at least **0.65** (the "good" boundary) across at least **10 graded requests**.
+
+The gate is a one-way veto, not a tie-breaker. The cheapest cost-and-success-eligible model is still the one picked; a poor average grade simply removes a candidate. Crucially, **too few graded requests means "no signal", not "bad"**: a model with fewer than 10 graded samples is judged on cost and success rate exactly as before, so installs without grading enabled see no change in routing behavior.
+
+The downgrade decision is logged with the candidate's graded quality, e.g. `... (avg grade: 0.82 over 14 graded)` or `... (ungraded)`.
+
### Extending complexity signals
Ship a `Configuration/SmartRouting/ComplexitySignals.php` in any extension:
@@ -457,6 +497,49 @@ Or add signals at runtime:
$GLOBALS['TYPO3_CONF_VARS']['EXTENSIONS']['aim']['complexitySignals']['de']['complex'][] = 'analysiere';
```
+## LLM Grading
+
+AiM can score the quality of AI responses using a second model as a judge ("LLM-as-a-judge"). Grading is opt-in per provider configuration and runs *after* the response has been delivered to the caller, so it adds no latency to the live request.
+
+### Enabling grading
+
+On any provider configuration (Admin Tools > AiM > Providers), open the **LLM Grading** tab:
+
+| Field | Purpose |
+|---|---|
+| `grading_enabled` | Turns grading on for this configuration |
+| `judge_configuration_uid` | A *different* AiM configuration used to score responses — typically a cheaper or specialized model that supports the conversation capability |
+| `grading_rubric` | The judge's instructions: what to evaluate (factual accuracy, relevance, tone, ...). The required JSON output format is appended automatically. |
+
+Grading covers `ConversationRequest` and `TextGenerationRequest`. It only runs when the effective privacy level is `standard`; the `reduced` and `none` levels skip it, since the judge needs the prompt and response content.
+
+### How it runs
+
+1. After a successful, gradeable response, `GraderMiddleware` marks the request log row `grade_status = pending` and registers a shutdown function.
+2. The shutdown function runs *after* the response is flushed to the caller, then calls the judge model.
+3. The judge returns a JSON `{score, label, reason}`, written back to the row (`grade_score`, `grade_label`, `grade_reason`).
+
+If the shutdown path is missed (CLI crash, an unusual SAPI), a scheduler command picks up the stragglers:
+
+```bash
+vendor/bin/typo3 aim:grade-pending
+```
+
+Run it from the TYPO3 scheduler every few minutes. It grades rows still marked `pending` that are older than `--min-age` seconds (default 60), so it never races the live shutdown handler. The request log module shows a warning when a pending backlog builds up.
+
+### Grades
+
+The judge assigns one of four labels. When it returns a score but no recognizable label, the label is derived from the score:
+
+| Label | Score range |
+|---|---|
+| `poor` | 0.00–0.39 |
+| `fair` | 0.40–0.64 |
+| `good` | 0.65–0.84 |
+| `excellent` | 0.85–1.00 |
+
+The judge call deliberately bypasses the middleware pipeline (it would otherwise produce a duplicate request-log row), but its cost is still rolled into the judge configuration's `total_cost` and recorded on the graded row's `judge_cost` column.
+
## Custom Middleware
Add middleware to intercept all AI requests:
@@ -544,6 +627,7 @@ The middleware pipeline is intentionally the only logging extension point: it gi
| `AccessControlMiddleware` | 90 | Provider access, capability permissions, budgets, rate limits |
| `SmartRoutingMiddleware` | 75 | Complexity classification, cost-based model downgrade |
| `CapabilityValidationMiddleware` | 50 | Validates provider capability, auto-reroutes if needed |
+| `GraderMiddleware` | -600 | Schedules LLM-as-a-judge grading after a successful response |
| `RequestLoggingMiddleware` | -700 | Logs every request (respects privacy levels) |
| `CostTrackingMiddleware` | -800 | Updates cumulative cost per configuration |
| `EventDispatchMiddleware` | -900 | Fires `BeforeAiRequestEvent` / `AfterAiResponseEvent` |
@@ -580,6 +664,7 @@ Monitor all AI requests:
- **User tracking**: shows the backend username for each request (empty for CLI/automation)
- **Full content**: prompt, system prompt, and response content per request (respects privacy levels)
- **Complexity classification**: score, label, and reason for each request
+- **Quality grades**: LLM-as-a-judge score, label, and reason per request when grading is enabled
- **Token details**: prompt, completion, cached, and reasoning token breakdowns
- **Rerouting info**: fallback and capability rerouting details
@@ -604,7 +689,7 @@ All widgets are refreshable and grouped under "AiM" in the widget picker. The re
| Table | Purpose |
|---|---|
| `tx_aim_configuration` | Provider configurations (TCA-managed). API keys, models, cost tracking, governance settings. |
-| `tx_aim_request_log` | Per-request log (no TCA). Tokens, cost, duration, prompt/response content, complexity classification, rerouting details. |
+| `tx_aim_request_log` | Per-request log (no TCA). Tokens, cost, duration, prompt/response content, complexity classification, rerouting details, LLM grading results. |
| `tx_aim_usage_budget` | Per-user budget tracking. Rolling period counters for tokens, cost, and request count. |
See `ext_tables.sql` for the full schema.
diff --git a/Resources/Private/Language/locallang_module.xlf b/Resources/Private/Language/locallang_module.xlf
index 7af95ce..bd56a1a 100644
--- a/Resources/Private/Language/locallang_module.xlf
+++ b/Resources/Private/Language/locallang_module.xlf
@@ -190,9 +190,24 @@
Duration
+
+ Grade
+
Status
+
+ Pending
+
+
+ Grade failed
+
+
+ Pending grades
+
+
+ %d request log row(s) have been waiting more than an hour to be graded. The shutdown handler appears to have missed them — run "vendor/bin/typo3 aim:grade-pending" or schedule it via the TYPO3 scheduler.
+
Success
diff --git a/Resources/Private/Language/locallang_tca.xlf b/Resources/Private/Language/locallang_tca.xlf
index 703044f..14a2236 100644
--- a/Resources/Private/Language/locallang_tca.xlf
+++ b/Resources/Private/Language/locallang_tca.xlf
@@ -81,6 +81,33 @@
Output Token Cost (per 1M)
+
+ LLM Grading
+
+
+ Response Quality Grading
+
+
+ Enable LLM grading
+
+
+ When enabled, successful responses from this configuration are scored by a second LLM (the judge) after the response has been delivered. The score, label, and reason are stored on the request log row. Grading only runs when the effective privacy level is "standard". Reduced or none disables it.
+
+
+ Judge configuration
+
+
+ The AiM configuration used to score responses. Should be a separate, cheaper or specialized configuration that supports the conversation capability. Cannot be this configuration itself.
+
+
+ Grading rubric (judge system prompt)
+
+
+ System prompt for the judge. Describe what to evaluate (e.g. factual accuracy, relevance, tone). The judge is instructed automatically to respond with JSON containing score (0.0–1.0), label (poor|fair|good|excellent), and reason. No need to mention the output format here.
+
+
+ Evaluate the response for factual accuracy and relevance to the user's prompt.
+
diff --git a/Resources/Private/Partials/RequestLog/Row.html b/Resources/Private/Partials/RequestLog/Row.html
index 5098435..72c8eec 100644
--- a/Resources/Private/Partials/RequestLog/Row.html
+++ b/Resources/Private/Partials/RequestLog/Row.html
@@ -39,6 +39,21 @@
| {entry.cost -> f:format.number(decimals: 6)} |
{entry.duration_ms -> f:format.number(decimals: 0, thousandsSeparator: ',')} ms |
+
+
+
+
+ excellent ({entry.grade_score -> f:format.number(decimals: 2)})
+ good ({entry.grade_score -> f:format.number(decimals: 2)})
+ fair ({entry.grade_score -> f:format.number(decimals: 2)})
+ {entry.grade_label} ({entry.grade_score -> f:format.number(decimals: 2)})
+
+
+
+
+ -
+
+ |
diff --git a/Resources/Private/Partials/RequestLog/Table.html b/Resources/Private/Partials/RequestLog/Table.html
index 4b4b44e..3e9e532 100644
--- a/Resources/Private/Partials/RequestLog/Table.html
+++ b/Resources/Private/Partials/RequestLog/Table.html
@@ -13,6 +13,7 @@
|
|
|
+ |
|
diff --git a/Resources/Private/Templates/Aim/RequestLog.html b/Resources/Private/Templates/Aim/RequestLog.html
index e082d8d..054290c 100644
--- a/Resources/Private/Templates/Aim/RequestLog.html
+++ b/Resources/Private/Templates/Aim/RequestLog.html
@@ -13,6 +13,11 @@
+
+
+
+
+
diff --git a/Tests/Functional/Command/GradePendingLogsTest.php b/Tests/Functional/Command/GradePendingLogsTest.php
new file mode 100644
index 0000000..aa35325
--- /dev/null
+++ b/Tests/Functional/Command/GradePendingLogsTest.php
@@ -0,0 +1,95 @@
+get(RequestLogRepository::class);
+ $now = time();
+
+ // Old pending row — should be graded
+ $oldUid = $logRepo->log([
+ 'crdate' => $now - 120,
+ 'configuration_uid' => 1,
+ 'grade_status' => GradeStatus::Pending->value,
+ ]);
+ // Fresh pending row — should be skipped (within min-age window)
+ $freshUid = $logRepo->log([
+ 'crdate' => $now - 5,
+ 'configuration_uid' => 1,
+ 'grade_status' => GradeStatus::Pending->value,
+ ]);
+
+ $gradedUids = [];
+ $stubGrader = $this->stubGradingService(function (int $uid) use (&$gradedUids, $logRepo): void {
+ $gradedUids[] = $uid;
+ $logRepo->updateGrade($uid, 0.8, GradeLabel::Good, 'test', 'gpt-4o-mini', 0.0001, 12);
+ });
+
+ $command = new GradePendingLogs($logRepo, $stubGrader);
+ $tester = new CommandTester($command);
+ $tester->execute(['--min-age' => '60', '--limit' => '10']);
+
+ self::assertSame([$oldUid], $gradedUids);
+ self::assertSame(0, $tester->getStatusCode());
+
+ $oldRow = $logRepo->findByUid($oldUid);
+ self::assertSame(GradeStatus::Done->value, $oldRow['grade_status']);
+
+ $freshRow = $logRepo->findByUid($freshUid);
+ self::assertSame(GradeStatus::Pending->value, $freshRow['grade_status']);
+ }
+
+    /**
+     * With no log rows inserted, the command exits with status 0 and prints
+     * the "No pending grades" notice.
+     */
+    #[Test]
+    public function reportsNoPendingRowsWhenEmpty(): void
+    {
+        $repository = $this->get(RequestLogRepository::class);
+        $noopGrader = $this->stubGradingService(static function () {});
+
+        $commandTester = new CommandTester(new GradePendingLogs($repository, $noopGrader));
+        $commandTester->execute([]);
+
+        self::assertSame(0, $commandTester->getStatusCode());
+        self::assertStringContainsString('No pending grades', $commandTester->getDisplay());
+    }
+
+    /**
+     * Build a GradingService double whose grade() only forwards the log uid
+     * to the given closure; the real constructor is not called.
+     */
+    private function stubGradingService(\Closure $onGrade): GradingService
+    {
+        $stub = new class($onGrade) extends GradingService {
+            // @phpstan-ignore-next-line — overriding constructor on purpose
+            public function __construct(private readonly \Closure $callback) {}
+
+            public function grade(int $logUid): void
+            {
+                $callback = $this->callback;
+                $callback($logUid);
+            }
+        };
+
+        return $stub;
+    }
+}
diff --git a/Tests/Functional/Domain/Repository/RequestLogRepositoryTest.php b/Tests/Functional/Domain/Repository/RequestLogRepositoryTest.php
new file mode 100644
index 0000000..bb00780
--- /dev/null
+++ b/Tests/Functional/Domain/Repository/RequestLogRepositoryTest.php
@@ -0,0 +1,92 @@
+get(RequestLogRepository::class);
+
+ // Three graded "done" rows for cheap-model: scores 0.6, 0.8, 1.0 → avg 0.8
+ foreach ([0.6, 0.8, 1.0] as $score) {
+ $logRepo->log($this->row('cheap-model', 0.5, GradeStatus::Done, $score));
+ }
+ // One failed and one ungraded row — must be excluded from the grade average
+ $logRepo->log($this->row('cheap-model', 0.4, GradeStatus::Failed, 0.0));
+ $logRepo->log($this->row('cheap-model', 0.4, GradeStatus::None, 0.0));
+
+ $profiles = $logRepo->getModelPerformanceProfile('TextGenerationRequest');
+ $cheap = $this->profileFor($profiles, 'cheap-model');
+
+ self::assertSame(5, $cheap['request_count']);
+ self::assertSame(3, $cheap['graded_count']);
+ self::assertEqualsWithDelta(0.8, $cheap['avg_grade_score'], 0.0001);
+ }
+
+    /**
+     * A model with only ungraded rows reports graded_count 0 and an average
+     * grade score of 0.0, while request_count still counts every row.
+     */
+    #[Test]
+    public function modelPerformanceProfileReportsZeroGradesForUngradedModel(): void
+    {
+        $repository = $this->get(RequestLogRepository::class);
+        for ($inserted = 0; $inserted < 2; $inserted++) {
+            $repository->log($this->row('ungraded-model', 0.5, GradeStatus::None, 0.0));
+        }
+
+        $profile = $this->profileFor(
+            $repository->getModelPerformanceProfile('TextGenerationRequest'),
+            'ungraded-model',
+        );
+
+        self::assertSame(2, $profile['request_count']);
+        self::assertSame(0, $profile['graded_count']);
+        self::assertSame(0.0, $profile['avg_grade_score']);
+    }
+
+    /**
+     * Build a successful TextGenerationRequest log row fixture for the given
+     * model, cost and grade state. The grade label is derived from the score
+     * only for rows in status Done; otherwise it is left empty.
+     */
+    private function row(string $model, float $cost, GradeStatus $status, float $gradeScore): array
+    {
+        $gradeLabel = '';
+        if ($status === GradeStatus::Done) {
+            $gradeLabel = GradeLabel::fromScore($gradeScore)->value;
+        }
+
+        return [
+            'crdate' => time(),
+            'request_type' => 'TextGenerationRequest',
+            'provider_identifier' => 'test',
+            'model_used' => $model,
+            'success' => 1,
+            'cost' => $cost,
+            'total_tokens' => 100,
+            'grade_status' => $status->value,
+            'grade_score' => $gradeScore,
+            'grade_label' => $gradeLabel,
+        ];
+    }
+
+    /**
+     * Return the performance profile whose model_used equals $model, failing
+     * the test when no such profile exists.
+     *
+     * @param list<array<string, mixed>> $profiles
+     * @return array<string, mixed>
+     */
+    private function profileFor(array $profiles, string $model): array
+    {
+        $match = null;
+        foreach ($profiles as $candidate) {
+            if ($candidate['model_used'] === $model) {
+                $match = $candidate;
+                break;
+            }
+        }
+
+        if ($match === null) {
+            self::fail('No performance profile for model "' . $model . '".');
+        }
+
+        return $match;
+    }
+}
diff --git a/Tests/Functional/Middleware/GraderMiddlewareTest.php b/Tests/Functional/Middleware/GraderMiddlewareTest.php
new file mode 100644
index 0000000..8729c6b
--- /dev/null
+++ b/Tests/Functional/Middleware/GraderMiddlewareTest.php
@@ -0,0 +1,265 @@
+get(RequestLogRepository::class);
+ $logUid = $logRepo->log([
+ 'crdate' => time(),
+ 'request_type' => 'ConversationRequest',
+ 'provider_identifier' => 'openai',
+ 'configuration_uid' => 1,
+ 'success' => 1,
+ 'response_content' => 'a response',
+ ]);
+ self::assertGreaterThan(0, $logUid);
+
+ $context = new RequestContext();
+ $context->logUid = $logUid;
+
+ $stubGrader = $this->buildStubGradingService();
+ $middleware = new GraderMiddleware($stubGrader, $logRepo, new NullLogger());
+
+ $config = $this->buildConfiguration([
+ 'grading_enabled' => 1,
+ 'judge_configuration_uid' => 99,
+ 'privacy_level' => 'standard',
+ ]);
+
+ $request = $this->buildConversationRequest($config);
+ $response = new TextResponse('hello', errors: []);
+
+ $next = new AiMiddlewareHandler(
+ static fn() => $response,
+ $context,
+ );
+
+ $middleware->process($request, $this->stubProvider(), $config, $next);
+
+ $row = $logRepo->findByUid($logUid);
+ self::assertSame(GradeStatus::Pending->value, $row['grade_status']);
+ }
+
+ /**
+  * With grading_enabled = 0 on the configuration, processing a request must
+  * leave the log row's grade_status at GradeStatus::None.
+  */
+ #[Test]
+ public function doesNotMarkPendingWhenGradingDisabled(): void
+ {
+     $logRepo = $this->get(RequestLogRepository::class);
+     $logUid = $logRepo->log(['crdate' => time(), 'configuration_uid' => 1]);
+     // Guard against a failed insert, which would make the final lookup meaningless.
+     self::assertGreaterThan(0, $logUid);
+
+     $context = new RequestContext();
+     $context->logUid = $logUid;
+     $middleware = new GraderMiddleware($this->buildStubGradingService(), $logRepo, new NullLogger());
+
+     $config = $this->buildConfiguration(['grading_enabled' => 0]);
+     $request = $this->buildConversationRequest($config);
+     $next = new AiMiddlewareHandler(static fn() => new TextResponse('x'), $context);
+
+     $middleware->process($request, $this->stubProvider(), $config, $next);
+
+     $row = $logRepo->findByUid($logUid);
+     // findByUid() may return null; assert the row exists before indexing it.
+     self::assertIsArray($row);
+     self::assertSame(GradeStatus::None->value, $row['grade_status']);
+ }
+
+ /**
+  * A request whose metadata carries '_aim_grading' => true must not be marked
+  * pending, even though grading is enabled and a judge is configured.
+  */
+ #[Test]
+ public function doesNotMarkPendingWhenRecursionFlagIsSet(): void
+ {
+     $logRepo = $this->get(RequestLogRepository::class);
+     $logUid = $logRepo->log(['crdate' => time(), 'configuration_uid' => 1]);
+     // Guard against a failed insert, which would make the final lookup meaningless.
+     self::assertGreaterThan(0, $logUid);
+
+     $context = new RequestContext();
+     $context->logUid = $logUid;
+     $middleware = new GraderMiddleware($this->buildStubGradingService(), $logRepo, new NullLogger());
+
+     $config = $this->buildConfiguration([
+         'grading_enabled' => 1,
+         'judge_configuration_uid' => 99,
+     ]);
+     // Built by hand (not via buildConversationRequest) to attach the metadata flag.
+     $request = new ConversationRequest(
+         configuration: $config,
+         messages: [new UserMessage('hi')],
+         metadata: ['_aim_grading' => true],
+     );
+     $next = new AiMiddlewareHandler(static fn() => new TextResponse('x'), $context);
+
+     $middleware->process($request, $this->stubProvider(), $config, $next);
+
+     $row = $logRepo->findByUid($logUid);
+     // findByUid() may return null; assert the row exists before indexing it.
+     self::assertIsArray($row);
+     self::assertSame(GradeStatus::None->value, $row['grade_status']);
+ }
+
+ /**
+  * A configuration with privacy_level 'reduced' must not have its requests
+  * marked pending, even with grading enabled and a judge configured.
+  */
+ #[Test]
+ public function doesNotMarkPendingWhenPrivacyIsReduced(): void
+ {
+     $logRepo = $this->get(RequestLogRepository::class);
+     $logUid = $logRepo->log(['crdate' => time(), 'configuration_uid' => 1]);
+     // Guard against a failed insert, which would make the final lookup meaningless.
+     self::assertGreaterThan(0, $logUid);
+
+     $context = new RequestContext();
+     $context->logUid = $logUid;
+     $middleware = new GraderMiddleware($this->buildStubGradingService(), $logRepo, new NullLogger());
+
+     $config = $this->buildConfiguration([
+         'grading_enabled' => 1,
+         'judge_configuration_uid' => 99,
+         'privacy_level' => 'reduced',
+     ]);
+     $request = $this->buildConversationRequest($config);
+     $next = new AiMiddlewareHandler(static fn() => new TextResponse('x'), $context);
+
+     $middleware->process($request, $this->stubProvider(), $config, $next);
+
+     $row = $logRepo->findByUid($logUid);
+     // findByUid() may return null; assert the row exists before indexing it.
+     self::assertIsArray($row);
+     self::assertSame(GradeStatus::None->value, $row['grade_status']);
+ }
+
+ /**
+  * An EmbeddingRequest must not be marked pending, even with grading enabled
+  * and a judge configured.
+  */
+ #[Test]
+ public function doesNotMarkPendingForEmbeddingRequests(): void
+ {
+     $logRepo = $this->get(RequestLogRepository::class);
+     $logUid = $logRepo->log(['crdate' => time(), 'configuration_uid' => 1]);
+     // Guard against a failed insert, which would make the final lookup meaningless.
+     self::assertGreaterThan(0, $logUid);
+
+     $context = new RequestContext();
+     $context->logUid = $logUid;
+     $middleware = new GraderMiddleware($this->buildStubGradingService(), $logRepo, new NullLogger());
+
+     $config = $this->buildConfiguration([
+         'grading_enabled' => 1,
+         'judge_configuration_uid' => 99,
+     ]);
+     $request = new EmbeddingRequest(
+         configuration: $config,
+         input: ['some text'],
+     );
+     $next = new AiMiddlewareHandler(static fn() => new TextResponse('x'), $context);
+
+     $middleware->process($request, $this->stubProvider(), $config, $next);
+
+     $row = $logRepo->findByUid($logUid);
+     // findByUid() may return null; assert the row exists before indexing it.
+     self::assertIsArray($row);
+     self::assertSame(GradeStatus::None->value, $row['grade_status']);
+ }
+
+ /**
+  * A response that carries errors must not be marked pending, even with
+  * grading enabled and a judge configured.
+  */
+ #[Test]
+ public function doesNotMarkPendingWhenResponseFailed(): void
+ {
+     $logRepo = $this->get(RequestLogRepository::class);
+     $logUid = $logRepo->log(['crdate' => time(), 'configuration_uid' => 1]);
+     // Guard against a failed insert, which would make the final lookup meaningless.
+     self::assertGreaterThan(0, $logUid);
+
+     $context = new RequestContext();
+     $context->logUid = $logUid;
+     $middleware = new GraderMiddleware($this->buildStubGradingService(), $logRepo, new NullLogger());
+
+     $config = $this->buildConfiguration([
+         'grading_enabled' => 1,
+         'judge_configuration_uid' => 99,
+     ]);
+     $request = $this->buildConversationRequest($config);
+     // Empty content plus a non-empty errors list stands in for a failed provider call.
+     $failedResponse = new TextResponse('', errors: ['boom']);
+     $next = new AiMiddlewareHandler(static fn() => $failedResponse, $context);
+
+     $middleware->process($request, $this->stubProvider(), $config, $next);
+
+     $row = $logRepo->findByUid($logUid);
+     // findByUid() may return null; assert the row exists before indexing it.
+     self::assertIsArray($row);
+     self::assertSame(GradeStatus::None->value, $row['grade_status']);
+ }
+
+ /**
+  * When judge_configuration_uid equals the configuration's own uid, the
+  * request must not be marked pending.
+  */
+ #[Test]
+ public function doesNotMarkPendingWhenJudgeIsSameConfiguration(): void
+ {
+     $logRepo = $this->get(RequestLogRepository::class);
+     $logUid = $logRepo->log(['crdate' => time(), 'configuration_uid' => 5]);
+     // Guard against a failed insert, which would make the final lookup meaningless.
+     self::assertGreaterThan(0, $logUid);
+
+     $context = new RequestContext();
+     $context->logUid = $logUid;
+     $middleware = new GraderMiddleware($this->buildStubGradingService(), $logRepo, new NullLogger());
+
+     // uid and judge_configuration_uid deliberately identical.
+     $config = $this->buildConfiguration([
+         'uid' => 5,
+         'grading_enabled' => 1,
+         'judge_configuration_uid' => 5,
+     ]);
+     $request = $this->buildConversationRequest($config);
+     $next = new AiMiddlewareHandler(static fn() => new TextResponse('ok'), $context);
+
+     $middleware->process($request, $this->stubProvider(), $config, $next);
+
+     $row = $logRepo->findByUid($logUid);
+     // findByUid() may return null; assert the row exists before indexing it.
+     self::assertIsArray($row);
+     self::assertSame(GradeStatus::None->value, $row['grade_status']);
+ }
+
+ /**
+  * Creates a ProviderConfiguration from test defaults (grading disabled, no
+  * judge, 'standard' privacy) with $overrides merged on top.
+  *
+  * @param array<string, mixed> $overrides Fields replacing the defaults by key.
+  */
+ private function buildConfiguration(array $overrides): ProviderConfiguration
+ {
+     $defaults = [
+         'uid' => 1,
+         'ai_provider' => 'openai',
+         'title' => 'Test',
+         'api_key' => 'sk-test',
+         'model' => 'gpt-4o-mini',
+         'privacy_level' => 'standard',
+         'grading_enabled' => 0,
+         'judge_configuration_uid' => 0,
+         'grading_rubric' => '',
+     ];
+     $fields = array_merge($defaults, $overrides);
+
+     return new ProviderConfiguration($fields);
+ }
+
+ /**
+  * Creates a ConversationRequest for $config containing a single user
+  * message 'hi' and no metadata argument.
+  */
+ private function buildConversationRequest(ProviderConfiguration $config): ConversationRequest
+ {
+     $messages = [new UserMessage('hi')];
+
+     return new ConversationRequest(configuration: $config, messages: $messages);
+ }
+
+ /**
+  * Returns an empty anonymous AiProviderInterface implementation to pass as
+  * the provider argument of GraderMiddleware::process().
+  */
+ private function stubProvider(): AiProviderInterface
+ {
+     $provider = new class implements AiProviderInterface {};
+
+     return $provider;
+ }
+
+ /**
+  * Returns a GradingService whose grade() does nothing, so that
+  * register_shutdown_function doesn't actually try to dispatch a judge
+  * request at end of test.
+  *
+  * The parent constructor is bypassed on purpose: the stub takes no
+  * dependencies.
+  */
+ private function buildStubGradingService(): GradingService
+ {
+     $stub = new class extends GradingService {
+         // @phpstan-ignore-next-line — overriding constructor on purpose
+         public function __construct() {}
+
+         public function grade(int $logUid): void {}
+     };
+
+     return $stub;
+ }
+}
diff --git a/Tests/Functional/Service/GradingServiceTest.php b/Tests/Functional/Service/GradingServiceTest.php
new file mode 100644
index 0000000..4ccaa46
--- /dev/null
+++ b/Tests/Functional/Service/GradingServiceTest.php
@@ -0,0 +1,153 @@
+get(RequestLogRepository::class);
+ $uid = $logRepo->log([
+ 'crdate' => time(),
+ 'configuration_uid' => 1,
+ 'grade_status' => 'pending',
+ ]);
+
+ $logRepo->updateGrade(
+ $uid,
+ score: 0.83,
+ label: GradeLabel::Good,
+ reason: 'Mostly correct, minor omissions.',
+ judgeModel: 'gpt-4o-mini',
+ judgeCost: 0.000123,
+ durationMs: 280,
+ );
+
+ $row = $logRepo->findByUid($uid);
+ self::assertSame(GradeStatus::Done->value, $row['grade_status']);
+ self::assertEqualsWithDelta(0.83, (float)$row['grade_score'], 0.001);
+ self::assertSame('good', $row['grade_label']);
+ self::assertStringContainsString('omissions', $row['grade_reason']);
+ self::assertSame('gpt-4o-mini', $row['judge_model']);
+ self::assertEqualsWithDelta(0.000123, (float)$row['judge_cost'], 1e-6);
+ self::assertSame(280, (int)$row['grade_duration_ms']);
+ self::assertSame('', $row['grade_error']);
+ }
+
+ /**
+  * markGradeFailed() must set the row to GradeStatus::Failed and store at most
+  * 500 characters of the error message (800 are passed in here).
+  */
+ #[Test]
+ public function repositoryMarkGradeFailedClampsErrorLength(): void
+ {
+     $logRepo = $this->get(RequestLogRepository::class);
+     $uid = $logRepo->log([
+         'crdate' => time(),
+         'configuration_uid' => 1,
+         'grade_status' => 'pending',
+     ]);
+
+     $logRepo->markGradeFailed($uid, str_repeat('x', 800));
+
+     $row = $logRepo->findByUid($uid);
+     // findByUid() may return null; assert the row exists before indexing it.
+     self::assertIsArray($row);
+     self::assertSame(GradeStatus::Failed->value, $row['grade_status']);
+     self::assertSame(500, mb_strlen($row['grade_error']));
+ }
+
+ /**
+  * Of three rows (old + pending, recent + pending, old + done), only the old
+  * pending one may be returned by findPendingGrades() and counted by
+  * countPendingGradesOlderThan().
+  */
+ #[Test]
+ public function findPendingGradesRespectsAgeFilter(): void
+ {
+     $repository = $this->get(RequestLogRepository::class);
+     $reference = time();
+
+     // Pending and 600 seconds old: the only row expected to match.
+     $expectedUid = $repository->log([
+         'crdate' => $reference - 600,
+         'configuration_uid' => 1,
+         'grade_status' => 'pending',
+     ]);
+     // Pending but only 10 seconds old.
+     $repository->log([
+         'crdate' => $reference - 10,
+         'configuration_uid' => 1,
+         'grade_status' => 'pending',
+     ]);
+     // Old enough, but already graded.
+     $repository->log([
+         'crdate' => $reference - 600,
+         'configuration_uid' => 1,
+         'grade_status' => 'done',
+     ]);
+
+     // Presumably (minimum age in seconds, row limit) — confirm against the repository.
+     $pendingRows = $repository->findPendingGrades(60, 100);
+     self::assertCount(1, $pendingRows);
+     self::assertSame($expectedUid, (int)$pendingRows[0]['uid']);
+
+     self::assertSame(1, $repository->countPendingGradesOlderThan(60));
+ }
+
+ /**
+  * A plain JSON object with score, label and reason must be parsed into the
+  * matching float, GradeLabel case and reason string.
+  */
+ #[Test]
+ public function parseJudgeOutputAcceptsCleanJson(): void
+ {
+     $parsed = $this->invokeParser('{"score": 0.75, "label": "good", "reason": "ok"}');
+     // invokeParser() is nullable; fail clearly here instead of indexing null below.
+     self::assertIsArray($parsed);
+     self::assertSame(0.75, $parsed['score']);
+     self::assertSame(GradeLabel::Good, $parsed['label']);
+     self::assertSame('ok', $parsed['reason']);
+ }
+
+ /**
+  * Judge output wrapped in a ```json … ``` markdown fence must still be
+  * parsed as if the fence were absent.
+  */
+ #[Test]
+ public function parseJudgeOutputStripsMarkdownFences(): void
+ {
+     $parsed = $this->invokeParser("```json\n{\"score\": 0.9, \"label\": \"excellent\", \"reason\": \"great\"}\n```");
+     // invokeParser() is nullable; fail clearly here instead of indexing null below.
+     self::assertIsArray($parsed);
+     self::assertSame(0.9, $parsed['score']);
+     self::assertSame(GradeLabel::Excellent, $parsed['label']);
+ }
+
+ /**
+  * Scores outside the unit interval must be clamped: 1.5 is expected to become
+  * 1.0 and -0.3 to become 0.0.
+  */
+ #[Test]
+ public function parseJudgeOutputClampsOutOfRangeScores(): void
+ {
+     $tooHigh = $this->invokeParser('{"score": 1.5, "label": "excellent", "reason": "x"}');
+     // invokeParser() is nullable; fail clearly here instead of indexing null below.
+     self::assertIsArray($tooHigh);
+     self::assertSame(1.0, $tooHigh['score']);
+
+     $tooLow = $this->invokeParser('{"score": -0.3, "label": "poor", "reason": "x"}');
+     self::assertIsArray($tooLow);
+     self::assertSame(0.0, $tooLow['score']);
+ }
+
+ /**
+  * An unrecognised label ('meh') must be replaced by one derived from the
+  * score; for a score of 0.7 the test expects GradeLabel::Good.
+  */
+ #[Test]
+ public function parseJudgeOutputBackfillsLabelFromScoreWhenInvalid(): void
+ {
+     $parsed = $this->invokeParser('{"score": 0.7, "label": "meh", "reason": "x"}');
+     // invokeParser() is nullable; fail clearly here instead of indexing null below.
+     self::assertIsArray($parsed);
+     self::assertSame(GradeLabel::Good, $parsed['label']);
+ }
+
+ /**
+  * Input that is not JSON, and JSON lacking a "score" key, must both make the
+  * parser return null.
+  */
+ #[Test]
+ public function parseJudgeOutputReturnsNullOnMalformedJson(): void
+ {
+     $rejectedInputs = ['not json at all', '{"missing": "score"}'];
+
+     foreach ($rejectedInputs as $raw) {
+         self::assertNull($this->invokeParser($raw));
+     }
+ }
+
+ /**
+  * Calls GradingService::parseJudgeOutput() through reflection on the
+  * container-provided service and returns its result unchanged.
+  *
+  * @return array<string, mixed>|null
+  */
+ private function invokeParser(string $raw): ?array
+ {
+     $gradingService = $this->get(GradingService::class);
+
+     return (new \ReflectionMethod($gradingService, 'parseJudgeOutput'))->invoke($gradingService, $raw);
+ }
+}
diff --git a/ext_tables.sql b/ext_tables.sql
index 7167b24..fc35ef9 100644
--- a/ext_tables.sql
+++ b/ext_tables.sql
@@ -14,7 +14,10 @@ CREATE TABLE tx_aim_configuration (
be_groups varchar(255) DEFAULT '' NOT NULL,
privacy_level varchar(20) DEFAULT 'standard' NOT NULL,
rerouting_allowed tinyint(1) unsigned DEFAULT '1' NOT NULL,
- auto_model_switch tinyint(1) unsigned DEFAULT '1' NOT NULL
+ auto_model_switch tinyint(1) unsigned DEFAULT '1' NOT NULL,
+ grading_enabled tinyint(1) unsigned DEFAULT '0' NOT NULL,
+ judge_configuration_uid int(11) unsigned DEFAULT '0' NOT NULL,
+ grading_rubric text
);
CREATE TABLE tx_aim_usage_budget (
@@ -61,6 +64,14 @@ CREATE TABLE tx_aim_request_log (
rerouted tinyint(1) unsigned DEFAULT '0' NOT NULL,
reroute_type varchar(20) DEFAULT '' NOT NULL,
reroute_reason varchar(255) DEFAULT '' NOT NULL,
+ grade_status varchar(20) DEFAULT 'none' NOT NULL,
+ grade_score double(5,4) DEFAULT '0.0000' NOT NULL,
+ grade_label varchar(20) DEFAULT '' NOT NULL,
+ grade_reason text,
+ judge_model varchar(255) DEFAULT '' NOT NULL,
+ judge_cost double(10,6) DEFAULT '0.000000' NOT NULL,
+ grade_duration_ms int(11) unsigned DEFAULT '0' NOT NULL,
+ grade_error varchar(500) DEFAULT '' NOT NULL,
PRIMARY KEY (uid),
KEY crdate (crdate),
@@ -69,5 +80,6 @@ CREATE TABLE tx_aim_request_log (
KEY configuration_uid (configuration_uid),
KEY user_id (user_id),
KEY model_used (model_used),
- KEY request_type (request_type)
+ KEY request_type (request_type),
+ KEY grade_status (grade_status)
);
|