Source code for data_juicer.tools.hpo.objects
import os
import shutil

from data_juicer.core import DefaultExecutor
from data_juicer.tools.quality_classifier.predict import predict_score


def get_hpo_objective(obj_name):
    if obj_name == "quality_score":
        return obj_quality_score
    # elif obj_name == "model_loss":
    #     return obj_model_loss
    # elif obj_name == "downstream_task":
    #     return obj_downstream_task
    # elif obj_name == "synergy_metric":
    #     return obj_synergy_metric
    else:
        raise NotImplementedError(
            f"unsupported objective type in HPO: {obj_name}. "
            f"Please implement it first."
        )
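

# A minimal usage sketch: resolve an objective by name and evaluate a single
# HPO trial with it. The helper below is hypothetical and not used elsewhere
# in this module; it assumes data_juicer.config.init_configs parses a recipe
# (e.g. ["--config", "<recipe>.yaml"]) into the dj_cfg namespace that the
# objectives expect.
def _demo_evaluate_trial(recipe_args):
    from data_juicer.config import init_configs

    objective = get_hpo_objective("quality_score")
    dj_cfg = init_configs(recipe_args)  # one search point of the data recipe
    return objective(dj_cfg)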


def obj_quality_score(dj_cfg):
    """
    HPO loop: cfg --> data --> data score --> cfg --> data --> ...

    :param dj_cfg: the specified data recipe (as a search point)
    :return: a data score, obtained after
        1. processing data according to the dj_cfg;
        2. applying a quality classifier to the processed data
    """
    if dj_cfg.executor_type == "default":
        executor = DefaultExecutor(dj_cfg)
    elif dj_cfg.executor_type == "ray":
        from data_juicer.core.executor.ray_executor import RayExecutor

        executor = RayExecutor(dj_cfg)
    else:
        raise NotImplementedError(
            f"unsupported executor_type: {dj_cfg.executor_type}, "
            f"expected in [`default`, `ray`]",
        )
    executor.run()

    # calculate and aggregate the data score.
    # Feel free to customize the quality scorer via the following args:
    #   [--model <model_path>] \
    #   [--tokenizer <tokenizer_type>] \
    #   [--keep_method <keep_method>] \
    #   [--text_key <text_key>] \
    tmp_res_export_path = dj_cfg.export_path + ".tmp_hpo.jsonl"
    if os.path.exists(tmp_res_export_path):
        if os.path.isfile(tmp_res_export_path):
            os.remove(tmp_res_export_path)
        if os.path.isdir(tmp_res_export_path):
            shutil.rmtree(tmp_res_export_path)

    overall_quality_stats = predict_score(
        dj_cfg.export_path, tmp_res_export_path, overall_stats=True
    )

    # by default, use the mean quality score of the processed data
    # as the final score
    return overall_quality_stats.loc["mean"]
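

# A sketch of the outer HPO loop described in obj_quality_score's docstring
# (cfg --> data --> data score --> cfg --> ...). The helper and the way
# candidate configs are generated are hypothetical; a real setup would come
# from the HPO tool's sweep/search-space configuration, and the sketch
# assumes the objective returns a scalar score.
def _demo_hpo_loop(base_cfg, candidate_cfgs):
    # candidate_cfgs: an iterable of dj_cfg search points derived from base_cfg
    objective = get_hpo_objective("quality_score")
    best_cfg, best_score = base_cfg, float("-inf")
    for trial_cfg in candidate_cfgs:
        score = objective(trial_cfg)  # cfg --> data --> data score
        if score > best_score:
            best_cfg, best_score = trial_cfg, score
    return best_cfg, best_score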