Source code for data_juicer.ops.mapper.remove_specific_chars_mapper
from typing import List, Union
import regex as re
from ..base_op import OPERATORS, Mapper
[docs]
@OPERATORS.register_module("remove_specific_chars_mapper")
class RemoveSpecificCharsMapper(Mapper):
    """Mapper to clean specific chars in text samples."""

    _batched_op = True

    def __init__(self, chars_to_remove: Union[str, List[str]] = "◆●■►▼▲▴∆▻▷❖♡□", *args, **kwargs):
        """
        Initialization method.

        :param chars_to_remove: a list or a string including all
            characters that need to be removed from text. Every
            character of the string (or of each string in the list) is
            treated literally. An empty value disables the mapper.
        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)

        # Flatten a string or a list of strings into its individual
        # characters; dict.fromkeys de-duplicates while keeping order.
        unique_chars = dict.fromkeys("".join(chars_to_remove)) if chars_to_remove else {}

        if unique_chars:
            # Build a character class. The members must NOT be joined with
            # "|": inside [...] the pipe is a literal, so it would be removed
            # from the text as well. Each character is escaped so that "]",
            # "\\", "^" and "-" cannot break or alter the class.
            self.pattern = "[" + "".join(re.escape(ch) for ch in unique_chars) + "]"
        else:
            # Nothing to remove: process_batched becomes a no-op.
            self.pattern = None

    def process_batched(self, samples):
        """
        Remove the configured characters from every text in the batch.

        :param samples: batch of samples; the entry stored under
            ``self.text_key`` is iterated as a sequence of text strings.
        :return: the same ``samples`` object, with the entry under
            ``self.text_key`` replaced by a list of cleaned texts.
        """
        if self.pattern is None:
            return samples

        samples[self.text_key] = [
            re.sub(pattern=self.pattern, repl=r"", string=text, flags=re.DOTALL) for text in samples[self.text_key]
        ]
        return samples