# Source code for data_juicer.ops.mapper.expand_macro_mapper
# Some code here has been modified from:
# https://github.com/togethercomputer/RedPajama-Data/blob/rp_v1/data_prep/arxiv/arxiv_cleaner.py
# --------------------------------------------------------
import regex as re
from ..base_op import OPERATORS, Mapper
[docs]
@OPERATORS.register_module("expand_macro_mapper")
class ExpandMacroMapper(Mapper):
    """Mapper to expand macro definitions in the document body of Latex
    samples.

    Only macros without arguments are handled: ``\\newcommand{\\name}{value}``,
    ``\\newcommand*{\\name}{value}`` and ``\\def\\name{value}``. Every use of
    such a macro in the text is replaced in place by its value. Macros that
    take arguments are not supported yet.
    """

    _batched_op = True

    def __init__(self, *args, **kwargs):
        """
        Initialization method.

        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)

    def _build_non_arg_macros_dict(self, file_content):
        """
        Collect the argument-less macro definitions found in a Latex text.

        :param file_content: the Latex text to scan for macro definitions
        :return: dict mapping each macro name (with its leading backslash)
            to its value; both are ``unicode-escape`` encoded, i.e. every
            backslash is doubled, so that they can be used as a regex
            pattern and as a replacement template respectively
        """
        # regex for extracting \newcommand macros without arguments
        non_arg_nc_reg = re.compile(
            # this regex matches the following:
            #   \newcommand{\macro_name}{macro_value}
            #   \newcommand*{\macro_name}{macro_value}
            # where macro_name is only allowed to contain letters and numbers;
            # macro_value can contain any character.
            # The closing brace has to be the last character of its line.
            pattern=r"\\\bnewcommand\b\*?\{(\\[a-zA-Z0-9]+?)\}\{(.*?)\}$",
            flags=re.MULTILINE,
        )

        # regex for extracting \def macros without arguments
        non_arg_def_reg = re.compile(
            # this regex matches the following:
            #   \def\macro_name{macro_value}
            # where macro_name is only allowed to contain letters and numbers;
            # macro_value can contain any character.
            # The closing brace has to be the last character of its line.
            pattern=r"\\def\s*(\\[a-zA-Z0-9]+?)\s*\{(.*?)\}$",
            flags=re.MULTILINE,
        )

        # Extract all user-defined macros from the text. A later definition
        # of the same name overrides an earlier one, and \def definitions
        # override \newcommand ones because they are scanned second.
        macros = {}
        for reg in (non_arg_nc_reg, non_arg_def_reg):
            for match in reg.finditer(file_content):
                # convert the macro name and value to a raw string that can
                # be used in re.sub
                macro_name = match.group(1).encode("unicode-escape").decode("utf-8")
                macro_val = match.group(2).encode("unicode-escape").decode("utf-8")
                macros[macro_name] = macro_val
        return macros

    def process_batched(self, samples):
        """
        Expand all argument-less macros in every text of the batch.

        :param samples: batch of samples; the list stored under
            ``self.text_key`` is updated in place with the expanded texts
        :return: the same ``samples`` object
        """
        for idx, text in enumerate(samples[self.text_key]):
            non_arg_macros = self._build_non_arg_macros_dict(text)

            # inline-expand all non-arg macros
            for macro_name, macro_value in non_arg_macros.items():
                text = re.sub(
                    # The negative lookahead makes sure that the macro is
                    # not part of a longer alphanumeric word. It does not
                    # consume the following character, so a macro at the
                    # very end of the text and macros that directly follow
                    # each other (e.g. "\foo\foo") are expanded as well.
                    pattern=r"(?:" + macro_name + r")(?![a-zA-Z0-9])",
                    repl=macro_value,
                    string=text,
                )

            # TODO: macros that take arguments are not supported yet;
            #  inline-expand macros with args

            samples[self.text_key][idx] = text

        return samples