# Source of module: data_juicer.utils.datasets_json_compat

"""
HuggingFace ``datasets`` parses JSON/JSON lines via pandas, which may call
``ujson``. UltraJSON rejects some values that CPython's ``json`` accepts,
notably very large integers, raising ``ValueError: Value is too big!``.

Set environment variable ``DATA_JUICER_USE_STDLIB_JSON=1`` (or ``true`` /
``yes`` / ``on``) before running ``dj-process`` (or any code path that calls
``init_configs``) to force the datasets stack to use ``json.loads`` instead.
"""

from __future__ import annotations

import importlib
import json
import os
from typing import Any, Union

from loguru import logger

# Name of the environment variable that opts in to the stdlib-json workaround.
_ENV_FLAG = "DATA_JUICER_USE_STDLIB_JSON"
# Process-wide guard so the patch is applied at most once per interpreter.
_PATCHED = False


def _truthy_env(name: str) -> bool:
    return os.environ.get(name, "").strip().lower() in ("1", "true", "yes", "on")


def apply_stdlib_json_patch_for_datasets() -> bool:
    """
    If ``DATA_JUICER_USE_STDLIB_JSON`` is enabled, replace
    ``datasets.utils.json.ujson_loads`` with ``json.loads`` (bytes-safe).

    .. note::
        We patch both ``datasets.utils.json`` and
        ``datasets.packaged_modules.json.json`` because the latter imports
        ``ujson_loads`` at module load time
        (``from ... import ujson_loads``), which binds the function object
        directly. Modifying the source module's attribute does not affect
        already-bound references.

    :return: whether the patch was applied in this process.
    """
    global _PATCHED
    # Idempotent: once applied in this process, report success immediately.
    if _PATCHED:
        return True
    # Opt-in only: do nothing unless the environment flag is truthy.
    if not _truthy_env(_ENV_FLAG):
        return False

    try:
        import datasets.utils.json as ds_json
    except ImportError:
        logger.warning(f"{_ENV_FLAG} is set but `datasets` is not installed; skipping JSON patch.")
        return False

    if not hasattr(ds_json, "ujson_loads"):
        logger.warning(
            f"{_ENV_FLAG} is set but `datasets.utils.json` has no ujson_loads; "
            "skipping JSON patch (your datasets version may differ)."
        )
        return False

    def _stdlib_loads(data: Union[str, bytes, bytearray], *_args: Any, **kwargs: Any) -> Any:
        # ``json.loads`` does not accept ujson-only kwargs; ignore extras for compatibility.
        kwargs.pop("precise_float", None)
        if isinstance(data, (bytes, bytearray)):
            data = data.decode("utf-8")
        return json.loads(data)

    # Patch the source module
    ds_json.ujson_loads = _stdlib_loads  # type: ignore[assignment]

    # Also patch the module that actually uses ujson_loads in load_dataset().
    # datasets.packaged_modules.json.json imports ujson_loads at module load time:
    #     from datasets.utils.json import ujson_loads
    # This binds the function object directly, so modifying ds_json.ujson_loads
    # does not affect the already-bound reference in that module.
    # Note: we use importlib because the module name 'json' conflicts with stdlib.
    try:
        ds_json_loader = importlib.import_module("datasets.packaged_modules.json.json")
        if hasattr(ds_json_loader, "ujson_loads"):
            ds_json_loader.ujson_loads = _stdlib_loads
    except ImportError:
        pass  # Older datasets versions may not have this module structure

    _PATCHED = True
    logger.info(
        f"Applied datasets JSON workaround: {_ENV_FLAG}=1 "
        "(using stdlib json instead of ujson for JSONL parsing)."
    )
    return True