Source code for data_juicer.utils.datasets_json_compat
"""
HuggingFace ``datasets`` parses JSON/JSON lines via pandas, which may call
``ujson``. UltraJSON rejects some values that CPython's ``json`` accepts,
notably very large integers, raising ``ValueError: Value is too big!``.
Set environment variable ``DATA_JUICER_USE_STDLIB_JSON=1`` (or ``true`` /
``yes`` / ``on``) before running ``dj-process`` (or any code path that calls
``init_configs``) to force the datasets stack to use ``json.loads`` instead.
"""
from __future__ import annotations
import importlib
import json
import os
from typing import Any, Union
from loguru import logger
_ENV_FLAG = "DATA_JUICER_USE_STDLIB_JSON"
_PATCHED = False
def _truthy_env(name: str) -> bool:
return os.environ.get(name, "").strip().lower() in ("1", "true", "yes", "on")
[docs]
def apply_stdlib_json_patch_for_datasets() -> bool:
"""
If ``DATA_JUICER_USE_STDLIB_JSON`` is enabled, replace
``datasets.utils.json.ujson_loads`` with ``json.loads`` (bytes-safe).
.. note::
We patch both ``datasets.utils.json`` and
``datasets.packaged_modules.json.json`` because the latter imports
``ujson_loads`` at module load time (``from ... import ujson_loads``),
which binds the function object directly. Modifying the source module's
attribute does not affect already-bound references.
:return: whether the patch was applied in this process.
"""
global _PATCHED
if _PATCHED:
return True
if not _truthy_env(_ENV_FLAG):
return False
try:
import datasets.utils.json as ds_json
except ImportError:
logger.warning(f"{_ENV_FLAG} is set but `datasets` is not installed; skipping JSON patch.")
return False
if not hasattr(ds_json, "ujson_loads"):
logger.warning(
f"{_ENV_FLAG} is set but `datasets.utils.json` has no ujson_loads; "
"skipping JSON patch (your datasets version may differ)."
)
return False
def _stdlib_loads(data: Union[str, bytes, bytearray], *_args: Any, **kwargs: Any) -> Any:
# ``json.loads`` does not accept ujson-only kwargs; ignore extras for compatibility.
kwargs.pop("precise_float", None)
if isinstance(data, (bytes, bytearray)):
data = data.decode("utf-8")
return json.loads(data)
# Patch the source module
ds_json.ujson_loads = _stdlib_loads # type: ignore[assignment]
# Also patch the module that actually uses ujson_loads in load_dataset().
# datasets.packaged_modules.json.json imports ujson_loads at module load time:
# from datasets.utils.json import ujson_loads
# This binds the function object directly, so modifying ds_json.ujson_loads
# does not affect the already-bound reference in that module.
# Note: we use importlib because the module name 'json' conflicts with stdlib.
try:
ds_json_loader = importlib.import_module("datasets.packaged_modules.json.json")
if hasattr(ds_json_loader, "ujson_loads"):
ds_json_loader.ujson_loads = _stdlib_loads
except ImportError:
pass # Older datasets versions may not have this module structure
_PATCHED = True
logger.info(
f"Applied datasets JSON workaround: {_ENV_FLAG}=1 " "(using stdlib json instead of ujson for JSONL parsing)."
)
return True