Skip to content

bug: requirements=["Answer must be between 100-200 words"] doesn't work reliably with granite4.1:3b #1108

@akihikokuroda

Description

@akihikokuroda

Validation of the requirement "Answer must be between 100-200 words" fails with granite4.1:3b even when the model generates a response that satisfies it. The same program works correctly with "llama3.2:3b".

"""Simple Mellea program with LLM-as-judge requirement and sampling strategy."""

import json
from mellea import start_session, generative
from mellea.core import Requirement
from mellea.plugins import HookType, PluginMode, hook
from mellea.stdlib.sampling import RejectionSamplingStrategy


@hook(HookType.VALIDATION_PRE_CHECK, mode=PluginMode.CONCURRENT)
async def log_validation_inputs(payload, _):
    """Dump the validation pre-check payload to stdout for debugging.

    Prints the requirements, the target under validation together with a
    whitespace-split word count of ``payload.target.value``, the context type
    (and its ``blocks`` count when the context is truthy), and the model
    options rendered as indented JSON. The second hook argument is unused.
    """
    banner = "\n📋 === VALIDATION PRE-CHECK HOOK ==="
    print(banner)

    requirements = payload.requirements
    print(f"Number of requirements: {len(requirements)}")
    for position, requirement in enumerate(requirements, start=1):
        print(f"  [{position}] {requirement}")

    target = payload.target
    print(f"Target being validated: {target}")
    # Same whitespace-based count that main() reports for the final answer.
    words = target.value.split()
    print(f"word_count: {len(words)}")

    context = payload.context
    print(f"Context type: {type(context).__name__}")
    if context:
        # Not every context is guaranteed to expose `blocks`; default to none.
        block_total = len(getattr(context, 'blocks', []))
        print(f"Context blocks: {block_total} block(s)")

    # default=str keeps non-JSON-serializable option values printable.
    options_dump = json.dumps(payload.model_options, default=str, indent=2)
    print(f"Model options: {options_dump}")
    print("=" * 40 + "\n")


def main():
    """Run the word-count repro and report the real validation outcome.

    Starts a session with the validation-logging plugin, asks one question
    under a "100-200 words" LLM-as-judge requirement using rejection sampling
    (budget of three attempts), then prints the selected answer with a locally
    computed word count.

    The closing status lines reflect whether any sample actually passed the
    requirement, instead of always claiming approval. When the judge rejects
    every sample, the local word count is compared against the 100-200 range
    so a false rejection of a valid answer is visible in the output.
    """
    from mellea.plugins import PluginSet

    # Create a plugin set with the validation hook
    validation_logger = PluginSet("validation-logger", [log_validation_inputs])

    # For comparison, this report states model_id="llama3.2:3b" validates correctly.
    session = start_session(model_id="granite4.1:3b", plugins=[validation_logger])

    question = "What is machine learning and how does it work?"
    loop_budget = 3

    print("Question:", question)
    print(
        f"\nGenerating up to {loop_budget} answers with LLM-as-judge validation "
        "(rejection sampling)...\n"
    )

    # Use rejection sampling strategy: generate up to `loop_budget` samples,
    # keep the first passing one
    strategy = RejectionSamplingStrategy(loop_budget=loop_budget)

    # Ask for the full sampling result so success/failure is observable;
    # without it, a failed run still yields an answer that looks approved.
    sampling = session.instruct(
        f"Answer the following question in 100-200 words: {question}",
        requirements=["Answer must be between 100-200 words"],
        strategy=strategy,
        return_sampling_results=True,
    )

    # Get the value from the selected thunk
    answer = sampling.result
    answer_text = answer.value if hasattr(answer, "value") else str(answer)
    answer_text = answer_text or ""
    word_count = len(answer_text.split())
    attempts = len(sampling.sample_generations)

    print(f"Answer ({word_count} words):\n{answer_text}\n")
    if sampling.success:
        print("✓ Answer approved by LLM-as-judge (100-200 words requirement)")
        print(
            f"✓ Generated via rejection sampling "
            f"({attempts} of up to {loop_budget} attempts used)"
        )
    else:
        print(
            f"✗ No answer was approved by LLM-as-judge after {attempts} attempt(s); "
            "showing the fallback sample"
        )
        # A locally valid count here means the judge rejected a valid answer.
        within_range = 100 <= word_count <= 200
        verdict = "inside" if within_range else "outside"
        print(f"  Local word count ({word_count}) is {verdict} the 100-200 range")


# Run the repro only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

Metadata

Metadata

Assignees

No one assigned

    Labels

    area/backends: Provider-specific work: Ollama, HF, LiteLLM, OpenAI, Bedrock, vLLM
    area/requirement: Requirement base class, validation, repair loops
    bug: Something isn't working

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions