data_juicer_agents.core.tool#

Core runtime-agnostic tool contracts and registry.

class data_juicer_agents.core.tool.DatasetSource(*, path: str = '', config: Dict[str, Any] | None = None, generated: Dict[str, Any] | None = None)[源代码]#

基类:BaseModel

Unified dataset source envelope.

Exactly one of path, config, or generated must be provided. Providing none of them, or more than one, raises a validation error.

Examples:

# Simple local file (shortcut)
DatasetSource(path="/data/train.jsonl")

# Structured load config (remote, multi-source, max_sample_num …)
DatasetSource(config={
    "configs": [
        {"type": "local", "path": "/data/a.jsonl", "weight": 0.7},
        {"type": "local", "path": "/data/b.jsonl", "weight": 0.3},
    ],
    "max_sample_num": 50000,
})

# Dynamic generation via Data-Juicer FORMATTERS
DatasetSource(generated={"type": "text_formatter", ...})
path: str#
config: Dict[str, Any] | None#
generated: Dict[str, Any] | None#
to_legacy_args() Dict[str, Any][源代码]#

Convert to the legacy (dataset_path, dataset, generated_dataset_config) dict.

Returns a dict with exactly the three legacy keys so callers can unpack with **source.to_legacy_args().

classmethod from_legacy(dataset_path: str = '', dataset: Dict[str, Any] | None = None, generated_dataset_config: Dict[str, Any] | None = None) DatasetSource[源代码]#

Create a DatasetSource from the legacy triple.

This is the primary migration bridge: CLI argument parsers and existing callers can keep their three-parameter interface and convert to the unified envelope at the boundary.

model_config = {}#

Configuration for the model; it should be a dictionary conforming to pydantic.config.ConfigDict.

class data_juicer_agents.core.tool.ToolArtifact(path: str, description: str = '', kind: str = 'file', label: str = '')[源代码]#

基类:object

Named artifact produced by a tool.

path: str#
description: str = ''#
kind: str = 'file'#
label: str = ''#
to_dict() Dict[str, Any][源代码]#
__init__(path: str, description: str = '', kind: str = 'file', label: str = '') None#
class data_juicer_agents.core.tool.ToolContext(working_dir: str = './.djx', env: Dict[str, str] = <factory>, artifacts_dir: str | None = None, runtime_values: Dict[str, Any] = <factory>)[源代码]#

基类:object

Execution context shared by all tool runtimes.

working_dir: str = './.djx'#
env: Dict[str, str]#
artifacts_dir: str | None = None#
runtime_values: Dict[str, Any]#
resolve_artifacts_dir() Path[源代码]#
__init__(working_dir: str = './.djx', env: Dict[str, str] = <factory>, artifacts_dir: str | None = None, runtime_values: Dict[str, Any] = <factory>) None#
class data_juicer_agents.core.tool.ToolRegistry(_tools: Dict[str, ToolSpec] = <factory>)[源代码]#

基类:object

Container of tool definitions.

register(spec: ToolSpec) None[源代码]#
get(name: str) ToolSpec[源代码]#
list(*, tags: Sequence[str] | None = None) List[ToolSpec][源代码]#
list_tools(*, tags: Sequence[str] | None = None) List[ToolSpec][源代码]#
names() List[str][源代码]#
__init__(_tools: Dict[str, ToolSpec] = <factory>) None#
class data_juicer_agents.core.tool.ToolResult(ok: bool, summary: str = '', data: Dict[str, Any] = <factory>, artifacts: List[ToolArtifact] = <factory>, error_type: str = '', error_message: str = '', next_actions: List[str] = <factory>)[源代码]#

基类:object

Normalized tool execution result.

ok: bool#
summary: str = ''#
data: Dict[str, Any]#
artifacts: List[ToolArtifact]#
error_type: str = ''#
error_message: str = ''#
next_actions: List[str]#
classmethod success(*, summary: str = '', data: Dict[str, Any] | None = None, artifacts: Iterable[ToolArtifact] | None = None) ToolResult[源代码]#
classmethod failure(*, summary: str, error_type: str, error_message: str = '', data: Dict[str, Any] | None = None, next_actions: Iterable[str] | None = None) ToolResult[源代码]#
to_payload(*, action: str | None = None) Dict[str, Any][源代码]#
__init__(ok: bool, summary: str = '', data: Dict[str, Any] = <factory>, artifacts: List[ToolArtifact] = <factory>, error_type: str = '', error_message: str = '', next_actions: List[str] = <factory>) None#
class data_juicer_agents.core.tool.ToolSpec(name: str, description: str, input_model: Type[BaseModel], output_model: Type[BaseModel] | None, executor: Callable[[ToolContext, BaseModel], ToolResult], tags: Tuple[str, ...] = (), effects: Literal['read', 'write', 'execute', 'external'] = 'read', confirmation: Literal['none', 'recommended', 'required'] = 'none')[源代码]#

基类:object

Definition of one atomic tool.

name: str#
description: str#
input_model: Type[BaseModel]#
output_model: Type[BaseModel] | None#
executor: Callable[[ToolContext, BaseModel], ToolResult]#
tags: Tuple[str, ...] = ()#
effects: Literal['read', 'write', 'execute', 'external'] = 'read'#
confirmation: Literal['none', 'recommended', 'required'] = 'none'#
execute(ctx: ToolContext, raw_input: BaseModel | Dict[str, Any]) ToolResult[源代码]#
__init__(name: str, description: str, input_model: Type[BaseModel], output_model: Type[BaseModel] | None, executor: Callable[[ToolContext, BaseModel], ToolResult], tags: Tuple[str, ...] = (), effects: Literal['read', 'write', 'execute', 'external'] = 'read', confirmation: Literal['none', 'recommended', 'required'] = 'none') None#
data_juicer_agents.core.tool.build_default_tool_registry(*, profile: str | None = None, groups: Sequence[str] | None = None) ToolRegistry[源代码]#
data_juicer_agents.core.tool.get_active_tool_profile() str[源代码]#
data_juicer_agents.core.tool.get_tool_spec(name: str, *, profile: str | None = None) ToolSpec[源代码]#
data_juicer_agents.core.tool.groups_for_tool_profile(profile: str | None) Tuple[str, ...] | None[源代码]#
data_juicer_agents.core.tool.list_tool_specs(*, tags: Sequence[str] | None = None, profile: str | None = None) List[ToolSpec][源代码]#
data_juicer_agents.core.tool.normalize_tool_profile(profile: str | None) str[源代码]#
data_juicer_agents.core.tool.tool_is_excluded_from_profile(tool_name: str, profile: str | None) bool[源代码]#