data_juicer_sandbox.utils 源代码
import os
from typing import List
[文档]
def validate_hook_output(pipelines, output_key):
    """
    Validate whether a specified hook output is valid.

    The output key is parsed into a pipeline name, a hook meta name and an
    output name. Every pipeline whose name matches is searched across its
    probe, refine-recipe, execution and evaluation jobs for a hook with the
    matching meta name. The key is valid when such a hook either declares no
    output keys at all (``output_keys is None``) or lists the output name.

    :param pipelines: A list of pipeline objects, each containing a ``name``
        attribute and the job lists ``probe_jobs``, ``refine_recipe_jobs``,
        ``execution_jobs`` and ``evaluation_jobs``
    :param output_key: The output key string with format
        "pipeline_name.hook_meta_name.output_name"
    :return: True if the corresponding pipeline, hook and output key are
        found and valid, otherwise False. A key that does not consist of
        exactly three dot-separated parts is also reported as False.
    """
    key_parts = output_key.split(".")
    if len(key_parts) != 3:
        # A malformed key cannot reference any hook output; report it as
        # invalid rather than failing with an opaque unpacking error.
        return False
    pipeline_name, hook_meta_name, output_name = key_parts

    for pipeline in pipelines:
        if pipeline.name != pipeline_name:
            continue
        all_jobs = (
            pipeline.probe_jobs + pipeline.refine_recipe_jobs + pipeline.execution_jobs + pipeline.evaluation_jobs
        )
        for hook in all_jobs:
            if hook.meta_name != hook_meta_name:
                continue
            # A hook whose output_keys is None is accepted for any output name.
            if hook.output_keys is None or output_name in hook.output_keys:
                return True
    return False
[文档]
def guess_file_or_dir(path: str) -> str:
    """
    Guess whether a path refers to a file or a directory from its name alone.

    Trailing slashes and backslashes are stripped first, then the last path
    component is inspected: if it contains a "." somewhere after its first
    character it is taken to be a file; anything else (no dot, or a leading
    dot such as ".cache") is taken to be a directory. The filesystem is never
    consulted.

    :param path: the path string to classify
    :return: "file" or "dir"
    """
    last_component = os.path.basename(path.rstrip("/\\"))
    # find() yields the index of the first dot: -1 when there is none and 0
    # when the name starts with one, so only a positive index means "file".
    return "file" if last_component.find(".") > 0 else "dir"
[文档]
def add_iter_subdir_to_paths(paths: List[str], iter_num: int) -> List[str]:
    """
    Insert an ``iter_<iter_num>`` sub-directory into each of the given paths.

    Whether a path is a file or a directory is decided by
    ``guess_file_or_dir``.

    Example (with ``iter_num = 3``):
        1. files: "/a/b/c/d.jsonl" --> "/a/b/c/iter_3/d.jsonl"
        2. dirs: "/a/b/c" --> "/a/b/c/iter_3"

    :param paths: the input original paths
    :param iter_num: the iteration number used to name the sub-directory
    :return: a new list with one result path per input path, in the same
        order, each containing the iteration sub-directory as shown above.
    """
    iter_dirname = "iter_" + str(iter_num)

    def _insert_iter_dir(original: str) -> str:
        # Directories simply gain the iteration folder as a child.
        if guess_file_or_dir(original) != "file":
            return os.path.join(original, iter_dirname)
        # Files keep their name; the iteration folder goes between the
        # parent directory and the file name.
        parent, filename = os.path.split(original)
        return os.path.join(parent, iter_dirname, filename)

    return [_insert_iter_dir(entry) for entry in paths]