# Source code for data_juicer.ops.mapper.clean_copyright_mapper
# Some code here has been modified from:
# https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/
# --------------------------------------------------------
import regex as re
from ..base_op import OPERATORS, Mapper
[docs]
@OPERATORS.register_module("clean_copyright_mapper")
class CleanCopyrightMapper(Mapper):
    """Mapper to clean copyright comments at the beginning of the text
    samples.

    Two heuristics are applied to each text, in this order:

    1. If the text contains a C-style ``/* ... */`` block comment, only the
       first such comment is examined. It is removed when it mentions
       "copyright" (case-insensitive); otherwise the text is returned
       unchanged. No further processing happens in either case.
    2. If the text contains no block comment at all, every leading line
       that is empty or starts with ``//``, ``#`` or ``--`` is dropped.
    """

    _batched_op = True

    # Line-comment markers recognised by the leading-header heuristic.
    _LINE_COMMENT_PREFIXES = ("//", "#", "--")

    def __init__(self, *args, **kwargs):
        """
        Initialization method.

        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)
        # Matches one complete C-style block comment: "/*", any content
        # that does not close the comment, then the terminating "*/".
        self.pat = re.compile(r"/\*[^*]*\*+(?:[^/*][^*]*\*+)*/")
        self.cpat = re.compile("copyright", re.IGNORECASE)

    def _process_single_sample(self, sample: str) -> str:
        """
        Strip a copyright header from a single text.

        NOTE(review): the early return below is placed at the level of the
        block-comment check (not inside the "copyright" check), because the
        line-comment pass would otherwise be unreachable or applied to
        texts that already had their header removed -- confirm against the
        upstream implementation.

        :param sample: the text to clean
        :return: the text with the detected header removed, or the
            original text when nothing was detected
        """
        match = self.pat.search(sample)
        if match:
            # Found a block comment; cut it out only if it talks about
            # copyright, then stop regardless of the outcome.
            if self.cpat.search(match.group()):
                sample = sample[: match.start()] + sample[match.end() :]
            return sample

        lines = sample.split("\n")

        # Greedily consume the leading run of line comments and blank
        # lines; most of these are copyright headers.
        skip = 0
        for line in lines:
            if line and not line.startswith(self._LINE_COMMENT_PREFIXES):
                break
            skip += 1

        if skip:
            sample = "\n".join(lines[skip:])
        return sample

    def process_batched(self, samples):
        """
        Clean every text of a batch in place.

        :param samples: batch whose ``self.text_key`` entry is an iterable
            of texts; that entry is replaced by a list of cleaned texts
        :return: the same ``samples`` object
        """
        samples[self.text_key] = [
            self._process_single_sample(text) for text in samples[self.text_key]
        ]
        return samples