代码可用的所有模块如下:
- data_juicer.analysis.column_wise_analysis
- data_juicer.analysis.correlation_analysis
- data_juicer.analysis.diversity_analysis
- data_juicer.analysis.measure
- data_juicer.analysis.overall_analysis
- data_juicer.config.config
- data_juicer.core.monitor
- data_juicer.download.downloader
- data_juicer.download.wikipedia
- data_juicer.format.csv_formatter
- data_juicer.format.empty_formatter
- data_juicer.format.formatter
- data_juicer.format.json_formatter
- data_juicer.format.load
- data_juicer.format.parquet_formatter
- data_juicer.format.text_formatter
- data_juicer.format.tsv_formatter
- data_juicer.ops.aggregator.entity_attribute_aggregator
- data_juicer.ops.aggregator.meta_tags_aggregator
- data_juicer.ops.aggregator.most_relevant_entities_aggregator
- data_juicer.ops.aggregator.nested_aggregator
- data_juicer.ops.base_op
- data_juicer.ops.common.dwpose_func
- data_juicer.ops.common.helper_func
- data_juicer.ops.common.prompt2prompt_pipeline
- data_juicer.ops.deduplicator.document_deduplicator
- data_juicer.ops.deduplicator.document_minhash_deduplicator
- data_juicer.ops.deduplicator.document_simhash_deduplicator
- data_juicer.ops.deduplicator.image_deduplicator
- data_juicer.ops.deduplicator.ray_basic_deduplicator
- data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator
- data_juicer.ops.deduplicator.ray_document_deduplicator
- data_juicer.ops.deduplicator.ray_image_deduplicator
- data_juicer.ops.deduplicator.ray_video_deduplicator
- data_juicer.ops.deduplicator.video_deduplicator
- data_juicer.ops.filter.alphanumeric_filter
- data_juicer.ops.filter.audio_duration_filter
- data_juicer.ops.load
- data_juicer.ops.op_fusion
- data_juicer.tools.mcp_server
- data_juicer.tools.quality_classifier.eval
- data_juicer.tools.quality_classifier.predict
- data_juicer.tools.quality_classifier.qc_utils
- data_juicer.tools.quality_classifier.train
- data_juicer.utils.asset_utils
- data_juicer.utils.cache_utils
- data_juicer.utils.ckpt_utils
- data_juicer.utils.common_utils
- data_juicer.utils.compress
- data_juicer.utils.constant
- data_juicer.utils.file_utils
- data_juicer.utils.fingerprint_utils
- data_juicer.utils.lazy_loader
- data_juicer.utils.logger_utils
- data_juicer.utils.mm_utils
- data_juicer.utils.model_utils
- data_juicer.utils.nltk_utils
- data_juicer.utils.process_utils
- data_juicer.utils.ray_utils
- data_juicer.utils.registry
- data_juicer.utils.resource_utils
- data_juicer.utils.s3_utils
- data_juicer.utils.sample
- data_juicer.utils.video_utils
- data_juicer.utils.webdataset_utils