# Source code for data_juicer.ops.mapper.clean_copyright_mapper

# Some code here has been modified from:
# https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/
# --------------------------------------------------------

import regex as re

from ..base_op import OPERATORS, Mapper



@OPERATORS.register_module("clean_copyright_mapper")
class CleanCopyrightMapper(Mapper):
    """Mapper to clean copyright comments from text samples.

    Each sample is handled by one of two heuristics:

    1. If the text contains a C-style ``/* ... */`` block comment anywhere,
       only the first such comment is inspected. It is removed when it
       mentions "copyright" (case-insensitive); otherwise the sample is
       returned unchanged. The second heuristic is not tried in either case.
    2. If there is no block comment, every leading line that is empty or
       starts with ``//``, ``#`` or ``--`` is dropped.
    """

    _batched_op = True

    # Prefixes that mark a line as a single-line comment for the
    # leading-comment heuristic.
    _LINE_COMMENT_PREFIXES = ("//", "#", "--")

    def __init__(self, *args, **kwargs):
        """
        Initialization method.

        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)
        # C-style block comment: "/*", any text, then the closing "*/".
        self.pat = re.compile(r"/\*[^*]*\*+(?:[^/*][^*]*\*+)*/")
        self.cpat = re.compile("copyright", re.IGNORECASE)

    def _process_single_sample(self, sample):
        """
        Strip a copyright header from a single text.

        :param sample: the text to clean
        :return: the text with the copyright block comment or the leading
            comment lines removed, or the unchanged text when neither
            heuristic applies
        """
        block_comment = self.pat.search(sample)
        if block_comment:
            # Only the first block comment is considered; cut it out when it
            # mentions "copyright", otherwise leave the text untouched.
            if self.cpat.search(block_comment.group()):
                start, end = block_comment.span()
                return sample[:start] + sample[end:]
            return sample

        lines = sample.split("\n")

        # Greedily drop the comment block a file begins with, most are
        # copyright headers. ``skip`` is the index of the first line that is
        # neither empty nor a line comment (all lines if there is none).
        skip = next(
            (
                idx
                for idx, line in enumerate(lines)
                if line and not line.startswith(self._LINE_COMMENT_PREFIXES)
            ),
            len(lines),
        )
        return "\n".join(lines[skip:]) if skip else sample

    def process_batched(self, samples):
        """
        Clean every text of a batch in place.

        :param samples: batch mapping that holds a list of texts under
            ``self.text_key``
        :return: the same mapping with the cleaned texts stored under
            ``self.text_key``
        """
        samples[self.text_key] = [self._process_single_sample(text) for text in samples[self.text_key]]
        return samples