Source code for l2p.feedback_builder

"""
PDDL Feedback Generation Functions

This module defines the `FeedbackBuilder` class for constructing structured
feedback loops during PDDL generation using LLMs. It provides default
implementations using prompts from `l2p/templates/feedback/`.
Users can subclass to add custom feedback methods or override hooks.
"""

import json
import time
from typing import Any, Dict, Optional, Tuple, Type, Union

from l2p.llm import BaseLLM, require_llm
from l2p.utils.pddl_format import *
from l2p.utils.pddl_prompt import DEF_FB_PROMPTS, build_ctx, safe_format
from l2p.utils.pddl_parser import parse_xml_tags, parse_component
from l2p.domain_builder import DomainBuilder
from l2p.problem_builder import ProblemBuilder
from l2p.planner_builder import PlanningResult



[docs]
class FeedbackBuilder:
    """
    Collection of LLM-driven feedback steps for generated PDDL artifacts.

    Each public `llm_*` method fills a prompt template (a caller-supplied one
    or the matching default from `DEF_FB_PROMPTS`), queries the given model
    with retries, and returns the content extracted from the expected XML
    tag(s) together with the raw LLM output.
    """


[docs]
    def normalize_artifacts(
        self,
        artifact: Dict[Type[BaseModel], List[BaseModel]] | List[BaseModel] | BaseModel,
    ) -> str:
        """
        Render one or more Pydantic models as JSON enclosed in XML tags.

        Three input shapes are handled:
            - a single model: dumped as one JSON object inside its class tag;
            - a non-empty list of models: grouped by class in first-seen order,
              each group dumped as a JSON array inside that class's tag;
            - a dict mapping model classes to lists of instances: each entry
              dumped as a JSON array inside the key class's tag.

        The tag name comes from the class-level `tag` attribute; when that
        attribute is a list or tuple, its first element is used. Fields whose
        value is None are left out of the JSON.

        Args:
            artifact: A model instance, a list of model instances, or a dict
                of model class -> list of instances.

        Returns:
            str: The tagged blocks, separated from each other by a blank line.

        Raises:
            ValueError: If `artifact` is an empty list or an unsupported type.
        """

        def tag_of(model_cls):
            # A class may declare several accepted tags; the first is canonical.
            declared = model_cls.tag
            if isinstance(declared, (list, tuple)):
                return declared[0]
            return declared

        def render(name, payload):
            body = json.dumps(payload, indent=2)
            return f"<{name}>\n{body}\n</{name}>"

        if isinstance(artifact, BaseModel):
            sections = [
                render(
                    tag_of(artifact.__class__),
                    artifact.model_dump(exclude_none=True),
                )
            ]
            return "\n\n".join(sections)

        if isinstance(artifact, list) and artifact:
            # Bucket the dumped instances per class, keeping first-seen order.
            by_class = {}
            for entry in artifact:
                by_class.setdefault(entry.__class__, []).append(
                    entry.model_dump(exclude_none=True)
                )
            sections = [
                render(tag_of(model_cls), dumped)
                for model_cls, dumped in by_class.items()
            ]
            return "\n\n".join(sections)

        if isinstance(artifact, dict):
            sections = [
                render(
                    tag_of(model_cls),
                    [inst.model_dump(exclude_none=True) for inst in instances],
                )
                for model_cls, instances in artifact.items()
            ]
            return "\n\n".join(sections)

        raise ValueError("Unsupported artifact type provided or empty list.")


    def _run_feedback(
        self, model: BaseLLM, xml_tag: str, prompt: str, max_retries: int
    ) -> Tuple[Union[Dict[str, Any], str], str]:
        """
        Query the LLM and return the first `<xml_tag>` block found in its reply.

        Every attempt resets the model's token counters, sends `prompt`, and
        looks for `xml_tag` in the response via `parse_xml_tags`. Any exception
        (including a missing block) is printed and the attempt is repeated
        after a 2 second pause, up to `max_retries` attempts in total.

        Args:
            model (BaseLLM): The LLM engine to query.
            xml_tag (str): Tag name to extract from the LLM output.
            prompt (str): Fully formatted prompt to send.
            max_retries (int): Maximum number of attempts; a value <= 0 fails
                immediately without querying the model.

        Returns:
            Tuple[Union[Dict[str, Any], str], str]:
                - The first block returned by `parse_xml_tags` for `xml_tag`.
                - The full raw text generated by the LLM.

        Raises:
            RuntimeError: If no attempt produced the expected block. The last
                underlying exception, if any, is attached as `__cause__`.
        """
        last_error: Optional[Exception] = None

        for attempt in range(max_retries):
            # Reset per attempt so a failed query is never reported together
            # with the reply of an earlier attempt.
            llm_output = None
            try:
                model.reset_tokens()
                llm_output = model.query(prompt=prompt)

                if blocks := parse_xml_tags(llm_output, xml_tag):
                    return blocks[0], llm_output
                raise ValueError(f"[ERROR] Missing <{xml_tag}> block in LLM output.")

            except Exception as e:
                last_error = e
                is_last_attempt = attempt + 1 >= max_retries
                next_step = "Giving up." if is_last_attempt else "Retrying..."
                print(
                    f"Error encountered during attempt {attempt + 1}/{max_retries}: {e}.\nLLM Output:\n{llm_output}\n{next_step}"
                )
                # No point in waiting when there is no further attempt.
                if not is_last_attempt:
                    time.sleep(2)

        raise RuntimeError(
            f"Max retries ({max_retries}) exceeded for '{xml_tag}' feedback."
        ) from last_error

    def _build_and_run(
        self,
        model: BaseLLM,
        xml_tag: Optional[str],
        default_tag: str,
        prompt_template: Optional[str],
        default_template: str,
        max_retries: int,
        description: Optional[str],
        ctx_kwargs: Dict[str, Any],
        **format_kwargs,
    ) -> Tuple[Union[Dict[str, Any], str], str]:
        """
        Shared pipeline: pick tag and template, format the prompt, query the LLM.

        Falsy `xml_tag` / `prompt_template` values fall back to `default_tag` /
        `default_template`. The prompt is produced by `safe_format`, which
        receives the chosen tag, the description, the context built by
        `build_ctx(**ctx_kwargs)`, and every extra `format_kwargs` entry.

        Args:
            model (BaseLLM): The LLM engine to use.
            xml_tag (Optional[str]): Caller-requested response tag, if any.
            default_tag (str): Tag used when `xml_tag` is not given.
            prompt_template (Optional[str]): Caller-supplied template, if any.
            default_template (str): Template used when none is supplied.
            max_retries (int): Maximum attempts to query the LLM.
            description (Optional[str]): Original NL description of the task.
            ctx_kwargs (Dict[str, Any]): Keyword arguments forwarded to
                `build_ctx`.
            **format_kwargs: Additional template fields. They must not reuse
                the names `template`, `xml_tag`, `description` or `context`,
                which are already passed to `safe_format` explicitly.

        Returns:
            Tuple[Union[Dict[str, Any], str], str]: Whatever `_run_feedback`
            returns for the formatted prompt.
        """
        effective_tag = xml_tag if xml_tag else default_tag
        chosen_template = prompt_template if prompt_template else default_template
        context_block = build_ctx(**ctx_kwargs)

        formatted_prompt = safe_format(
            template=chosen_template,
            xml_tag=effective_tag,
            description=description,
            context=context_block,
            **format_kwargs,
        )

        return self._run_feedback(model, effective_tag, formatted_prompt, max_retries)

    # ------------------------------------------------------------------
    # Public feedback methods
    # ------------------------------------------------------------------


[docs]
    @require_llm
    def llm_diagnose(
        self,
        model: BaseLLM,
        artifact: Any,
        errors: Union[List[str], str],
        xml_tag: Optional[str] = None,
        prompt_template: Optional[str] = None,
        description: Optional[str] = None,
        max_retries: int = 3,
        **kwargs: Any,
    ) -> Tuple[Union[Dict[str, Any], str], str]:
        """
        Diagnoses syntax or structural errors in a generated PDDL component.

        The LLM analyzes deterministic Python errors alongside the failed JSON
        artifact to output a root-cause diagnosis and repair plan. It does not
        generate corrected PDDL code.

        Args:
            model (BaseLLM): The LLM engine to use.
            artifact (Any): The failed Pydantic model(s) that caused the error.
            errors (Union[List[str], str]): The traceback or validation errors.
                A list or tuple is joined with newlines; each entry is passed
                through `str()` first, so non-string entries (error dicts,
                exception objects) are accepted as well.
            xml_tag (Optional[str]): The XML tag expected in the LLM response.
                Defaults to "diagnosis".
            prompt_template (Optional[str]): Custom prompt template string.
                Defaults to `DEF_FB_PROMPTS.diagnosis`.
            description (Optional[str]): Original NL description of the task.
            max_retries (int): Maximum attempts to query the LLM.
            **kwargs: Additional context variables for the prompt.

        Returns:
            Tuple[Union[Dict[str, Any], str], str]:
                - The parsed dictionary/string from inside the XML tag.
                - The full raw text generated by the LLM.

        Raises:
            ValueError: If `artifact` is an empty list or an unsupported type.
            RuntimeError: If the expected block is not obtained within
                `max_retries` attempts.
        """
        if isinstance(errors, (list, tuple)):
            # str() each entry: str.join rejects anything that is not a string.
            error_text = "\n".join(str(entry) for entry in errors)
        else:
            error_text = errors

        return self._build_and_run(
            model=model,
            xml_tag=xml_tag,
            default_tag="diagnosis",
            prompt_template=prompt_template,
            default_template=DEF_FB_PROMPTS.diagnosis,
            max_retries=max_retries,
            description=description,
            ctx_kwargs=kwargs,
            errors=error_text,
            artifact=self.normalize_artifacts(artifact),
        )



[docs]
    @require_llm
    def llm_evaluate(
        self,
        model: BaseLLM,
        artifact: Any,
        xml_tag: Optional[str] = None,
        prompt_template: Optional[str] = None,
        description: Optional[str] = None,
        max_retries: int = 3,
        **kwargs: Any,
    ) -> Tuple[Union[Dict[str, Any], str], str]:
        """
        Asks the LLM to judge a generated PDDL component against the task text.

        The artifact is serialized with `normalize_artifacts` and inserted into
        the evaluation prompt, so the model can assess whether the component
        matches the user's intent even when it is syntactically valid.

        Args:
            model (BaseLLM): The LLM engine to use.
            artifact (Any): The Pydantic model(s) to evaluate.
            xml_tag (Optional[str]): The XML tag expected in the LLM response.
                Defaults to "evaluation".
            prompt_template (Optional[str]): Custom prompt template string.
                Defaults to `DEF_FB_PROMPTS.evaluate`.
            description (Optional[str]): Original NL description of the task.
            max_retries (int): Maximum attempts to query the LLM.
            **kwargs: Additional context variables for the prompt.

        Returns:
            Tuple[Union[Dict[str, Any], str], str]:
                - The parsed dictionary/string from inside the XML tag.
                - The full raw text generated by the LLM.
        """
        fallback_template = DEF_FB_PROMPTS.evaluate
        artifact_text = self.normalize_artifacts(artifact)

        outcome = self._build_and_run(
            model=model,
            xml_tag=xml_tag,
            default_tag="evaluation",
            prompt_template=prompt_template,
            default_template=fallback_template,
            max_retries=max_retries,
            description=description,
            ctx_kwargs=kwargs,
            artifact=artifact_text,
        )
        return outcome



[docs]
    @require_llm
    def llm_reflect(
        self,
        model: BaseLLM,
        artifact: Any,
        xml_tag: Optional[str] = None,
        prompt_template: Optional[str] = None,
        diagnosis: Optional[str] = None,
        description: Optional[str] = None,
        max_retries: int = 3,
        **kwargs: Any,
    ) -> Tuple[Union[Dict[str, Any], str], str]:
        """
        Asks the LLM to distill a failure into a reusable lesson.

        The serialized artifact and the supplied diagnosis are inserted into
        the reflection prompt so the model can turn a specific repair plan
        into a general rule that helps avoid repeating the mistake.

        Args:
            model (BaseLLM): The LLM engine to use.
            artifact (Any): The failed Pydantic model(s).
            xml_tag (Optional[str]): The XML tag expected in the LLM response.
                Defaults to "reflection".
            prompt_template (Optional[str]): Custom prompt template string.
                Defaults to `DEF_FB_PROMPTS.reflection`.
            diagnosis (Optional[str]): The root-cause diagnosis of the failure.
            description (Optional[str]): Original NL description of the task.
            max_retries (int): Maximum attempts to query the LLM.
            **kwargs: Additional context variables for the prompt.

        Returns:
            Tuple[Union[Dict[str, Any], str], str]:
                - The parsed dictionary/string from inside the XML tag.
                - The full raw text generated by the LLM.
        """
        fallback_template = DEF_FB_PROMPTS.reflection
        artifact_text = self.normalize_artifacts(artifact)

        outcome = self._build_and_run(
            model=model,
            xml_tag=xml_tag,
            default_tag="reflection",
            prompt_template=prompt_template,
            default_template=fallback_template,
            max_retries=max_retries,
            description=description,
            ctx_kwargs=kwargs,
            diagnosis=diagnosis,
            artifact=artifact_text,
        )
        return outcome



[docs]
    @require_llm
    def llm_revise(
        self,
        model: BaseLLM,
        artifact: Any,
        component_class: Union[Type[BaseModel], List[Type[BaseModel]]],
        prompt_template: Optional[str] = None,
        diagnosis: Optional[str] = None,
        description: Optional[str] = None,
        max_retries: int = 3,
        **kwargs: Any,
    ) -> Tuple[Dict[Type[BaseModel], List[BaseModel]], str]:
        """
        Revises a broken PDDL component based on a repair plan and parses the output
        back into explicit Pydantic models.

        For every requested class, the LLM output is searched for the class's
        tag (or, when `tag` is a sequence, each tag in order until one is
        found) and the matching blocks are converted with `parse_component`.
        Any failure is printed and the query is repeated after a 2 second
        pause, up to `max_retries` attempts in total.

        Args:
            model (BaseLLM): The LLM engine to use.
            artifact (Any): The failed Pydantic model(s) to fix.
            component_class (Union[Type[BaseModel], List[Type[BaseModel]]]): The expected
                Pydantic classes to extract from the LLM's revised output. A single
                class, or a list/tuple of classes.
            prompt_template (Optional[str]): Custom prompt template string.
                Defaults to `DEF_FB_PROMPTS.revise`.
            diagnosis (Optional[str]): The specific repair plan to follow.
            description (Optional[str]): Original NL description of the task.
            max_retries (int): Maximum attempts to query the LLM.
            **kwargs: Additional context variables for the prompt.

        Returns:
            Tuple[Dict[Type[BaseModel], List[BaseModel]], str]:
                - A dictionary mapping each requested component class to its parsed instances.
                - The full raw text generated by the LLM.

        Raises:
            ValueError: If `artifact` is an empty list or an unsupported type.
            RuntimeError: If no attempt yielded every requested component. The
                last underlying exception, if any, is attached as `__cause__`.
        """

        prompt_template = prompt_template if prompt_template else DEF_FB_PROMPTS.revise
        # A tuple of classes is treated like a list instead of being wrapped
        # as a single (invalid) "class".
        classes = (
            list(component_class)
            if isinstance(component_class, (list, tuple))
            else [component_class]
        )

        prompt = safe_format(
            template=prompt_template,
            description=description,
            context=build_ctx(**kwargs),
            diagnosis=diagnosis,
            artifact=self.normalize_artifacts(artifact),
        )

        last_error: Optional[Exception] = None

        for attempt in range(max_retries):
            # Reset per attempt so a failed query is never reported together
            # with the reply of an earlier attempt.
            llm_output = None
            try:
                model.reset_tokens()
                llm_output = model.query(prompt=prompt)
                results = {}

                # iterate over each class the LLM was supposed to fix and extract it
                for cls in classes:
                    tags = [cls.tag] if isinstance(cls.tag, str) else cls.tag

                    for t in tags:
                        if raw_blocks := parse_xml_tags(
                            llm_output=llm_output, tag_name=t
                        ):
                            results[cls] = parse_component(
                                raw_blocks=raw_blocks, model_class=cls, tag_name=t
                            )
                            break
                    else:
                        # None of the class's accepted tags appeared in the output.
                        raise ValueError(
                            f"[ERROR] Missing expected XML block in LLM output. Looked for: {cls.tag}"
                        )

                return results, llm_output

            except Exception as e:
                last_error = e
                is_last_attempt = attempt + 1 >= max_retries
                next_step = "Giving up." if is_last_attempt else "Retrying..."
                print(
                    f"Error encountered during attempt {attempt + 1}/{max_retries}: {e}. "
                    f"\nLLM Output: \n\n{llm_output}\n\n {next_step}"
                )
                # No point in waiting when there is no further attempt.
                if not is_last_attempt:
                    time.sleep(2)

        raise RuntimeError(
            "Max retries exceeded. Failed to revise and extract components."
        ) from last_error



[docs]
    @require_llm
    def llm_select(
        self,
        model: BaseLLM,
        candidates: List[BaseModel],
        xml_tag: Optional[str] = None,
        prompt_template: Optional[str] = None,
        description: Optional[str] = None,
        max_retries: int = 3,
        **kwargs: Any,
    ) -> Tuple[Union[Dict[str, Any], str], str]:
        """
        Asks the LLM to pick the best candidate from a pool of generations.

        Each candidate is dumped to JSON (None-valued fields omitted) and
        wrapped in a numbered `<candidate_i>` tag, counting from 1, so the
        model can refer to its choice by number in the response.

        Args:
            model (BaseLLM): The LLM engine to use.
            candidates (List[BaseModel]): A list of alternative generations.
            xml_tag (Optional[str]): The XML tag expected in the LLM response.
                Defaults to "selection".
            prompt_template (Optional[str]): Custom prompt template string.
                Defaults to `DEF_FB_PROMPTS.select`.
            description (Optional[str]): Original NL description of the task.
            max_retries (int): Maximum attempts to query the LLM.
            **kwargs: Additional context variables for the prompt.

        Returns:
            Tuple[Union[Dict[str, Any], str], str]:
                - The parsed dictionary/string from inside the XML tag.
                - The full raw text generated by the LLM.
        """
        rendered_pool = []
        for position, candidate in enumerate(candidates, start=1):
            payload = json.dumps(candidate.model_dump(exclude_none=True), indent=2)
            rendered_pool.append(
                f"<candidate_{position}>\n{payload}\n</candidate_{position}>"
            )

        fallback_template = DEF_FB_PROMPTS.select
        pool_text = "\n\n".join(rendered_pool)

        outcome = self._build_and_run(
            model=model,
            xml_tag=xml_tag,
            default_tag="selection",
            prompt_template=prompt_template,
            default_template=fallback_template,
            max_retries=max_retries,
            description=description,
            ctx_kwargs=kwargs,
            candidates=pool_text,
        )
        return outcome



[docs]
    @require_llm
    def llm_evaluate_plan(
        self,
        model: BaseLLM,
        plan: Union[str, PlanningResult],
        domain: Union[str, DomainDetails],
        problem: Union[str, ProblemDetails],
        xml_tag: Optional[str] = None,
        prompt_template: Optional[str] = None,
        description: Optional[str] = None,
        max_retries: int = 3,
        **kwargs: Any,
    ) -> Tuple[Union[Dict[str, Any], str], str]:
        """
        Asks the LLM whether a found plan matches the intended behaviour.

        A planner may return a plan that is formally valid yet exploits a
        loophole in the model; this step has the LLM review the plan against
        the domain, problem and task description. Structured inputs are
        converted to text first: `DomainDetails` via
        `DomainBuilder.generate_domain`, `ProblemDetails` via
        `ProblemBuilder.generate_problem`, and `PlanningResult` via
        `format_plan` on its `plan` attribute. Other values are used as given.

        Args:
            model (BaseLLM): The LLM engine to use.
            plan (Union[str, PlanningResult]): The successful plan to evaluate.
            domain (Union[str, DomainDetails]): The domain string or Pydantic model.
            problem (Union[str, ProblemDetails]): The problem string or Pydantic model.
            xml_tag (Optional[str]): The XML tag expected in the LLM response.
                Defaults to "plan_evaluation".
            prompt_template (Optional[str]): Custom prompt template string.
                Defaults to `DEF_FB_PROMPTS.plan_evaluate`.
            description (Optional[str]): Original NL description of the task.
            max_retries (int): Maximum attempts to query the LLM.
            **kwargs: Additional context variables for the prompt.

        Returns:
            Tuple[Union[Dict[str, Any], str], str]:
                - The parsed dictionary/string from inside the XML tag.
                - The full raw text generated by the LLM.
        """
        fallback_template = DEF_FB_PROMPTS.plan_evaluate

        domain_text = domain
        if isinstance(domain, DomainDetails):
            domain_text = DomainBuilder.generate_domain(domain)

        problem_text = problem
        if isinstance(problem, ProblemDetails):
            problem_text = ProblemBuilder.generate_problem(problem)

        plan_text = plan
        if isinstance(plan, PlanningResult):
            plan_text = format_plan(plan_list=plan.plan)

        outcome = self._build_and_run(
            model=model,
            xml_tag=xml_tag,
            default_tag="plan_evaluation",
            prompt_template=prompt_template,
            default_template=fallback_template,
            max_retries=max_retries,
            description=description,
            ctx_kwargs=kwargs,
            domain=domain_text,
            problem=problem_text,
            plan=plan_text,
        )
        return outcome



[docs]
    @require_llm
    def llm_diagnose_plan(
        self,
        model: BaseLLM,
        domain: Union[str, DomainDetails],
        problem: Union[str, ProblemDetails],
        plan_error: Union[str, PlanningResult],
        xml_tag: Optional[str] = None,
        prompt_template: Optional[str] = None,
        description: Optional[str] = None,
        max_retries: int = 3,
        **kwargs: Any,
    ) -> Tuple[Union[Dict[str, Any], str], str]:
        """
        Asks the LLM why a planner could not solve a domain/problem pair.

        The planner's error text (e.g. 'Unsolvable', 'Timeout') is presented
        to the model together with the PDDL so it can locate the logical
        bottleneck. Structured inputs are converted to text first:
        `DomainDetails` via `DomainBuilder.generate_domain`, `ProblemDetails`
        via `ProblemBuilder.generate_problem`, and for a `PlanningResult` its
        `error_message` attribute is used. Other values are used as given.

        Args:
            model (BaseLLM): The LLM engine to use.
            domain (Union[str, DomainDetails]): The domain string or Pydantic model.
            problem (Union[str, ProblemDetails]): The problem string or Pydantic model.
            plan_error (Union[str, PlanningResult]): The crash trace or failed PlanningResult.
            xml_tag (Optional[str]): The XML tag expected in the LLM response.
                Defaults to "plan_diagnosis".
            prompt_template (Optional[str]): Custom prompt template string.
                Defaults to `DEF_FB_PROMPTS.plan_diagnosis`.
            description (Optional[str]): Original NL description of the task.
            max_retries (int): Maximum attempts to query the LLM.
            **kwargs: Additional context variables for the prompt.

        Returns:
            Tuple[Union[Dict[str, Any], str], str]:
                - The parsed dictionary/string from inside the XML tag.
                - The full raw text generated by the LLM.
        """
        fallback_template = DEF_FB_PROMPTS.plan_diagnosis

        domain_text = domain
        if isinstance(domain, DomainDetails):
            domain_text = DomainBuilder.generate_domain(domain)

        problem_text = problem
        if isinstance(problem, ProblemDetails):
            problem_text = ProblemBuilder.generate_problem(problem)

        error_text = plan_error
        if isinstance(plan_error, PlanningResult):
            error_text = plan_error.error_message

        outcome = self._build_and_run(
            model=model,
            xml_tag=xml_tag,
            default_tag="plan_diagnosis",
            prompt_template=prompt_template,
            default_template=fallback_template,
            max_retries=max_retries,
            description=description,
            ctx_kwargs=kwargs,
            domain=domain_text,
            problem=problem_text,
            error=error_text,
        )
        return outcome