Source code for data_juicer.ops.mapper.dialog_llm_input_utils

# Copyright 2025 The Data-Juicer Authors. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Helpers for dialog LLM mappers (intent / topic / sentiment / intensity)."""

from __future__ import annotations

from typing import List, Tuple



[docs]
def build_dialog_turns_for_prompt(
    sample: dict,
    *,
    history_key: str,
    query_key: str,
    response_key: str,
) -> List[Tuple[str, str]]:
    """Assemble the ``(user, assistant)`` turns fed to dialog LLM mappers.

    ``sample`` is only read, never modified. History entries that are not a
    list/tuple with at least two items are skipped; ``None`` members become
    ``""`` and everything else is passed through ``str``.

    The merge of ``query``/``response`` follows
    ``dialog_quality_llm_utils._normalize_dialog_tail``: after normalization
    the final exchange is present both as ``dialog_history[-1]`` and as
    ``query``/``response``, so it must not be appended a second time (that
    would duplicate the last turn; earlier code that edited
    ``dialog_history`` in place corrupted downstream rows).

    Merge outcome when ``sample[query_key]`` is truthy:
      * no history turns, or last user text differs -> append a new turn;
      * last turn equals the pair exactly -> leave the list as is;
      * same user text, different response -> replace the last turn.
    """

    def _as_text(value: object) -> str:
        # None maps to the empty string; anything else is stringified.
        return "" if value is None else str(value)

    history = sample.get(history_key)
    turns: List[Tuple[str, str]] = (
        [
            (_as_text(entry[0]), _as_text(entry[1]))
            for entry in history
            if isinstance(entry, (list, tuple)) and len(entry) >= 2
        ]
        if isinstance(history, list)
        else []
    )

    # A falsy/missing query means there is no trailing exchange to merge.
    if not sample.get(query_key):
        return turns

    raw_query = sample[query_key]
    raw_response = sample.get(response_key) or ""
    tail = (_as_text(raw_query), _as_text(raw_response))

    if not turns or turns[-1][0] != tail[0]:
        # Nothing to merge with, or a genuinely new user message.
        turns.append(tail)
    elif turns[-1] != tail:
        # Same user message, different response: prefer the sample's pair.
        turns[-1] = tail
    # Otherwise the tail already sits at the end of the history.
    return turns




[docs]
def clip_text_for_dialog_prompt(
    text: str,
    max_chars: int,
    note: str = "truncated",
) -> str:
    """Truncate long ``text`` for API prompts when ``max_chars`` > 0.

    Agent traces often concatenate tool outputs into ``response``; formatter
    limits elsewhere do not apply to these mappers' ``history_key`` payloads.

    Args:
        text: Text to clip. Falsy values (e.g. ``""``) are returned unchanged.
        max_chars: Upper bound on the length of the result. ``None`` or a
            value ``<= 0`` disables clipping.
        note: Label embedded in the truncation marker ``"\\n…[<note>]…"``.

    Returns:
        ``text`` itself when clipping is disabled or it already fits.
        Otherwise a string of at most ``max_chars`` characters:

        * a prefix of ``text`` followed by the marker, when there is room for
          at least one character of ``text`` plus the marker;
        * the marker alone (without its leading newline), when only the
          marker fits;
        * a plain prefix ``text[:max_chars]``, when not even the marker fits.
    """
    if max_chars is None or max_chars <= 0:
        return text
    if not text or len(text) <= max_chars:
        return text

    suffix = f"\n…[{note}]…"
    take = max_chars - len(suffix)
    if take > 0:
        return text[:take] + suffix

    # No room for any of the text next to the marker. Use the bare marker
    # only if it respects the budget; otherwise hard-cut the text so the
    # result never exceeds ``max_chars`` (and never outgrows the input).
    marker = suffix.strip()
    if len(marker) <= max_chars:
        return marker
    return text[:max_chars]




[docs]
def clip_query_response_pair(
    q: object,
    r: object,
    max_query_chars: int,
    max_response_chars: int,
) -> Tuple[str, str]:
    """Stringify a query/response pair and clip each side independently.

    ``None`` is converted to ``""``; any other value goes through ``str``.
    The query is clipped with ``max_query_chars`` and the note
    ``"query truncated"``; the response with ``max_response_chars`` and the
    note ``"response truncated"``. Clipping itself is delegated to
    ``clip_text_for_dialog_prompt``.

    Returns:
        A ``(query, response)`` tuple of the clipped strings.
    """
    query_text = str(q) if q is not None else ""
    response_text = str(r) if r is not None else ""

    clipped_query = clip_text_for_dialog_prompt(
        query_text, max_query_chars, "query truncated"
    )
    clipped_response = clip_text_for_dialog_prompt(
        response_text, max_response_chars, "response truncated"
    )
    return clipped_query, clipped_response