# Source code for data_juicer.tools.quality_classifier.eval

# This tool is used for evaluating a quality classifier on your own datasets
# based on PySpark.
#
# We provide several trained models for you. Please refer to the comments at
# the beginning of predict tool for more details.
#
# This tool needs several arguments:
#   - positive_datasets: the paths to the positive datasets. It could be a
#       string for a single dataset, e.g. 'pos.parquet', or a list of strings
#       for several datasets, e.g. '["pos1.parquet", "pos2.parquet"]'.
#   - negative_datasets: the paths to the negative datasets. It could be a
#       string for a single dataset, e.g. 'neg.parquet', or a list of strings
#       for several datasets, e.g. '["neg1.parquet", "neg2.parquet"]'.
#   - model: quality classifier name to apply. It's "my_quality_model" by
#       default. You can
#       use one of ["gpt3", "chinese", "code"] we provided, or you can set it
#       to the path to your own model trained using the train.py tool.
#   - tokenizer: what tokenizer to use to tokenize texts. It's None in default,
#       which means using the standard Tokenizer of PySpark. You can use one of
#       ["zh.sp.model", "code.sp.model"] we provided, or you can set it to the
#       path to your own sentencepiece model.
#   - text_key: the field key name to hold texts to be classified. It's "text"
#       in default.

import sys

import fire
from loguru import logger

from data_juicer.tools.quality_classifier.qc_utils import (
    eval,
    init_spark,
    load_datasets,
)


@logger.catch(reraise=True)
def main(
    positive_datasets=None,
    negative_datasets=None,
    model="my_quality_model",
    tokenizer=None,
    text_key="text",
):
    """
    Evaluate a trained quality classifier using specific positive/negative
    datasets.

    :param positive_datasets: the paths to the positive datasets. It could be
        a string for a single dataset, e.g. 'pos.parquet', or a list of
        strings for multiple datasets, e.g. '["pos1.parquet", "pos2.parquet"]'
    :param negative_datasets: the paths to the negative datasets. It could be
        a string for a single dataset, e.g. 'neg.parquet', or a list of
        strings for multiple datasets, e.g. '["neg1.parquet", "neg2.parquet"]'
    :param model: quality classifier name to apply. It's "my_quality_model"
        by default. You can use one of ["gpt3", "chinese", "code"] we
        provided, or you can set it to the path to your own model trained
        using the train.py tool
    :param tokenizer: what tokenizer to use to tokenize texts. It's None by
        default, which means using the standard Tokenizer of PySpark. You can
        use one of ["zh.sp.model", "code.sp.model"] we provided, or you can
        set it to the path to your own sentencepiece model
    :param text_key: the field key name to hold texts to be classified.
        It's "text" by default
    :return: None
    """
    # Accept both a single path string and a list of paths: treat a missing
    # argument as an empty list and wrap a bare string into a one-item list.
    if positive_datasets is None:
        positive_datasets = []
    if negative_datasets is None:
        negative_datasets = []
    if isinstance(positive_datasets, str):
        positive_datasets = [positive_datasets]
    if isinstance(negative_datasets, str):
        negative_datasets = [negative_datasets]

    # initialize a spark session
    spark = init_spark()

    # load positive (label=1) and negative (label=0) datasets; a loader
    # presumably returns None when nothing could be loaded — the branches
    # below rely on that
    pos = load_datasets(
        spark, positive_datasets, text_key=text_key, label=1, only_text=True
    )
    neg = load_datasets(
        spark, negative_datasets, text_key=text_key, label=0, only_text=True
    )

    # merge whatever was successfully loaded into a single dataset
    if pos is not None and neg is not None:
        ds = pos.unionAll(neg)
    elif pos is not None:
        ds = pos
    elif neg is not None:
        ds = neg
    else:
        # Nothing to evaluate on: report and exit with a non-zero status.
        # (The original called exit(0), signalling success despite the
        # logged error.)
        logger.error("Empty dataset.")
        sys.exit(1)

    # start evaluation
    logger.info(f"Number of samples: {ds.count()}")
    eval(model, ds, tokenizer)
if __name__ == "__main__":
    # Expose the evaluation entry point as a command-line interface.
    fire.Fire(main)