Source code for data_juicer_agents.tools.plan.build_dataset_spec.input
import json
from typing import Any, Dict, List, Optional
from pydantic import BaseModel, ConfigDict, Field, field_validator
from data_juicer_agents.core.tool import DatasetSource
[docs]
class BuildDatasetSpecInput(BaseModel):
model_config = ConfigDict(extra="allow") # Allow advanced dataset fields as extra kwargs
intent: str = Field(
description=(
"User intent for the current planning task. "
"For advanced dataset options (e.g., export_type, export_shard_size, "
"export_in_parallel, load_dataset_kwargs, suffixes, image_special_token, etc.), "
"call list_dataset_fields first to discover available fields, "
"then pass them directly as additional arguments to this tool."
)
)
export_path: str = Field(description="Output dataset path.")
dataset_source: DatasetSource = Field(
description=(
"Dataset source specification. Provide exactly one of: "
"path (local file/directory shortcut), "
"config (structured load config for remote sources, multi-source mixing, "
"max_sample_num, per-source weights — call list_dataset_load_strategies to "
"discover available types/sources), "
"or generated (dynamic formatter config — call list_dataset_formatters to "
"discover available formatters and parameters)."
),
)
dataset_profile: Dict[str, Any] = Field(
default_factory=dict,
description="Dataset inspection payload returned by inspect_dataset.",
)
modality_hint: str = Field(default="", description="Optional explicit modality override.")
text_keys_hint: List[str] = Field(default_factory=list, description="Optional text key overrides.")
image_key_hint: str = Field(default="", description="Optional image key override.")
audio_key_hint: str = Field(default="", description="Optional audio key override.")
video_key_hint: str = Field(default="", description="Optional video key override.")
image_bytes_key_hint: str = Field(default="", description="Optional image-bytes key override.")