Source code for data_juicer.tools.hpo.objects

import os
import shutil

from data_juicer.core import DefaultExecutor
from data_juicer.tools.quality_classifier.predict import predict_score


def get_hpo_objective(obj_name):
    if obj_name == "quality_score":
        return obj_quality_score
    # elif obj_name == "model_loss":
    #     return obj_model_loss
    # elif obj_name == "downstream_task":
    #     return obj_downstream_task
    # elif obj_name == "synergy_metric":
    #     return obj_synergy_metric
    else:
        raise NotImplementedError(
            f"unsupported objective type in HPO: {obj_name}. " f"Please implement it first."
        )

def obj_quality_score(dj_cfg):
    """
    HPO loop: cfg --> data --> data score --> cfg --> data --> ...

    :param dj_cfg: specified data recipe (as a search point)
    :return: a data score, after
        1. processing data according to the dj_cfg;
        2. applying a quality classifier
    """
    if dj_cfg.executor_type == "default":
        executor = DefaultExecutor(dj_cfg)
    elif dj_cfg.executor_type == "ray":
        from data_juicer.core.executor.ray_executor import RayExecutor

        executor = RayExecutor(dj_cfg)
    else:
        raise NotImplementedError(
            f"unsupported executor_type: {dj_cfg.executor_type}, "
            f"expected in [`default`, `ray`]",
        )
    executor.run()

    # calculate and aggregate data score
    # feel free to customize the quality scorer, via the following args
    #   [--model <model_path>] \
    #   [--tokenizer <tokenizer_type>] \
    #   [--keep_method <keep_method>] \
    #   [--text_key <text_key>] \
    tmp_res_export_path = dj_cfg.export_path + ".tmp_hpo.jsonl"
    if os.path.exists(tmp_res_export_path):
        if os.path.isfile(tmp_res_export_path):
            os.remove(tmp_res_export_path)
        if os.path.isdir(tmp_res_export_path):
            shutil.rmtree(tmp_res_export_path)
    overall_quality_stats = predict_score(dj_cfg.export_path, tmp_res_export_path, overall_stats=True)

    # by default, using the mean quality score of processed data as final score
    return overall_quality_stats.loc["mean"]
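

# ----------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module).
#
# It shows how the callable returned by `get_hpo_objective` could be evaluated
# over several candidate data recipes inside a simple search loop. The recipe
# paths and the use of `data_juicer.config.init_configs` with a CLI-style
# argument list are assumptions for illustration; in practice the objective is
# typically driven by an external HPO framework (e.g. a sweep/study) that
# supplies each search point's config.
# ----------------------------------------------------------------------------
if __name__ == "__main__":
    from data_juicer.config import init_configs  # assumed config loader

    objective = get_hpo_objective("quality_score")

    # hypothetical candidate recipes, each representing one search point
    candidate_recipes = [
        "configs/demo/sweep_point_0.yaml",
        "configs/demo/sweep_point_1.yaml",
    ]

    best_score, best_recipe = float("-inf"), None
    for recipe in candidate_recipes:
        dj_cfg = init_configs(args=["--config", recipe])
        score = objective(dj_cfg)  # process data per recipe, then score it
        if score > best_score:
            best_score, best_recipe = score, recipe

    print(f"best recipe: {best_recipe} (mean quality score: {best_score})")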