data_juicer.ops.mapper.chinese_convert_mapper 源代码
from data_juicer.utils.lazy_loader import LazyLoader
from ..base_op import OPERATORS, Mapper
opencc = LazyLoader("opencc")
OP_NAME = "chinese_convert_mapper"
OPENCC_CONVERTER = None
[文档]
def prepare_converter(mode):
mode_path = mode + ".json"
global OPENCC_CONVERTER
if OPENCC_CONVERTER is None:
# empty converter
OPENCC_CONVERTER = opencc.OpenCC(mode_path)
if not OPENCC_CONVERTER.config.endswith(mode_path):
# the config is actually a config path
# update and get a new converter with specified mode
OPENCC_CONVERTER = opencc.OpenCC(mode_path)
[文档]
@OPERATORS.register_module(OP_NAME)
class ChineseConvertMapper(Mapper):
"""Mapper to convert Chinese between Traditional Chinese, Simplified Chinese
and Japanese Kanji."""
_batched_op = True
[文档]
def __init__(self, mode: str = "s2t", *args, **kwargs):
"""
Initialization method.
:param mode: Choose the mode to convert Chinese:
s2t: Simplified Chinese to Traditional Chinese,
t2s: Traditional Chinese to Simplified Chinese,
s2tw: Simplified Chinese to Traditional Chinese (Taiwan Standard),
tw2s: Traditional Chinese (Taiwan Standard) to Simplified Chinese,
s2hk: Simplified Chinese to Traditional Chinese
(Hong Kong variant),
hk2s: Traditional Chinese (Hong Kong variant) to Simplified
Chinese,
s2twp: Simplified Chinese to Traditional Chinese (Taiwan Standard)
with Taiwanese idiom,
tw2sp: Traditional Chinese (Taiwan Standard) to Simplified Chinese
with Mainland Chinese idiom,
t2tw: Traditional Chinese to Traditional Chinese (Taiwan Standard),
tw2t: Traditional Chinese (Taiwan standard) to Traditional Chinese,
hk2t: Traditional Chinese (Hong Kong variant) to Traditional
Chinese,
t2hk: Traditional Chinese to Traditional Chinese
(Hong Kong variant),
t2jp: Traditional Chinese Characters (Kyūjitai) to New Japanese
Kanji,
jp2t: New Japanese Kanji (Shinjitai) to Traditional Chinese
Characters,
:param args: extra args
:param kwargs: extra args
"""
super().__init__(*args, **kwargs)
mode_list = [
"s2t",
"t2s",
"s2tw",
"tw2s",
"s2hk",
"hk2s",
"s2twp",
"tw2sp",
"t2tw",
"tw2t",
"hk2t",
"t2hk",
"t2jp",
"jp2t",
]
assert mode in mode_list, "Please make sure mode is one of {}".format(mode_list)
self.mode = mode
prepare_converter(self.mode)
[文档]
def process_batched(self, samples):
prepare_converter(self.mode)
samples[self.text_key] = [OPENCC_CONVERTER.convert(text) for text in samples[self.text_key]]
return samples