Validation of the requirement "Answer must be between 100-200 words" fails with granite4.1:3b even when the model generates a response that satisfies it. The same program works correctly with llama3.2:3b.
"""Simple Mellea program with LLM-as-judge requirement and sampling strategy."""
import json
from mellea import start_session, generative
from mellea.core import Requirement
from mellea.plugins import HookType, PluginMode, hook
from mellea.stdlib.sampling import RejectionSamplingStrategy
@hook(HookType.VALIDATION_PRE_CHECK, mode=PluginMode.CONCURRENT)
async def log_validation_inputs(payload, _):
    """Print what is passed into requirement validation.

    Registered for the ``VALIDATION_PRE_CHECK`` hook. The function only
    reads from the payload and prints; it returns ``None``.

    Args:
        payload: Hook payload. This function reads its ``requirements``,
            ``target``, ``context`` and ``model_options`` attributes.
        _: Second hook argument; unused here.
    """
    print("\n📋 === VALIDATION PRE-CHECK HOOK ===")
    print(f"Number of requirements: {len(payload.requirements)}")
    for i, req in enumerate(payload.requirements, 1):
        print(f" [{i}] {req}")
    print(f"Target being validated: {payload.target}")

    # A debugging hook should not raise while logging. Guard against a
    # missing target or a target whose value is None instead of calling
    # ``.split()`` on it unconditionally.
    # NOTE(review): whether the payload always carries a computed target is
    # not visible from this file -- confirm against the plugin docs.
    target_text = getattr(payload.target, "value", None)
    if target_text is None:
        print("word_count: n/a (target has no value)")
    else:
        print(f"word_count: {len(target_text.split())}")

    print(f"Context type: {type(payload.context).__name__}")
    if payload.context:
        # ``blocks`` is looked up defensively; a context without that
        # attribute is reported as having 0 blocks.
        print(f"Context blocks: {len(getattr(payload.context, 'blocks', []))} block(s)")
    # ``default=str`` stringifies any option value json cannot serialize.
    print(f"Model options: {json.dumps(payload.model_options, default=str, indent=2)}")
    print("=" * 40 + "\n")
def main():
    """Run the rejection-sampling demo and report whether validation passed.

    Starts a session with the validation-logging plugin attached, asks one
    question under a "100-200 words" requirement with a rejection-sampling
    budget of 3, prints the resulting answer with its whitespace word count,
    and then reports whether any sample was accepted.

    The success messages are printed only when the sampling result reports
    success; previously they were printed unconditionally, which hid exactly
    the validation failure this script is meant to expose.
    """
    from mellea.plugins import PluginSet

    # Create a plugin set with the validation hook
    validation_logger = PluginSet("validation-logger", [log_validation_inputs])
    # Alternative model kept for comparison runs:
    # session = start_session(model_id="llama3.2:3b", plugins=[validation_logger])
    session = start_session(model_id="granite4.1:3b", plugins=[validation_logger])

    question = "What is machine learning and how does it work?"
    print("Question:", question)
    print(
        "\nGenerating 3 answers with LLM-as-judge validation (rejection sampling)...\n"
    )

    # Use rejection sampling strategy: generate up to 3 samples, keep the first passing one
    strategy = RejectionSamplingStrategy(loop_budget=3)
    # NOTE(review): relies on ``return_sampling_results=True`` making
    # ``instruct`` return a sampling result exposing ``.success`` and
    # ``.result`` -- confirm against the installed mellea version.
    sampling = session.instruct(
        f"Answer the following question in 100-200 words: {question}",
        requirements=["Answer must be between 100-200 words"],
        strategy=strategy,
        return_sampling_results=True,
    )

    # Get the value from the thunk; fall back to an empty string so the word
    # count below cannot fail on a missing value.
    answer = sampling.result
    answer_text = answer.value if hasattr(answer, "value") else str(answer)
    answer_text = answer_text or ""
    word_count = len(answer_text.split())
    print(f"Answer ({word_count} words):\n{answer_text}\n")

    if sampling.success:
        print("✓ Answer approved by LLM-as-judge (100-200 words requirement)")
        print("✓ Generated via rejection sampling (best of 3 attempts)")
    else:
        print("✗ No sample was approved by the LLM-as-judge within 3 attempts")
        # Show the programmatic check next to the judge's verdict so a
        # disagreement between the two is visible in the output.
        in_range = 100 <= word_count <= 200
        print(f"  Whitespace word count within 100-200: {in_range}")
# Run the demo only when this file is executed as a script, so importing the
# module (e.g. to reuse the hook) does not start a model session.
if __name__ == "__main__":
    main()
Validation of the requirement "Answer must be between 100-200 words" fails with granite4.1:3b even when the model generates a response that satisfies it. The same program works correctly with llama3.2:3b.