All modules for which code is available:
- data_juicer.analysis.collector
- data_juicer.analysis.column_wise_analysis
- data_juicer.analysis.correlation_analysis
- data_juicer.analysis.diversity_analysis
- data_juicer.analysis.measure
- data_juicer.analysis.overall_analysis
- data_juicer.config.config
- data_juicer.core.adapter
- data_juicer.core.analyzer
- data_juicer.core.data.config_validator
- data_juicer.core.data.data_validator
- data_juicer.core.data.dataset_builder
- data_juicer.core.data.dj_dataset
- data_juicer.core.data.load_strategy
- data_juicer.core.data.ray_dataset
- data_juicer.core.data.schema
- data_juicer.core.executor.base
- data_juicer.core.executor.default_executor
- data_juicer.core.executor.factory
- data_juicer.core.executor.ray_executor
- data_juicer.core.exporter
- data_juicer.core.monitor
- data_juicer.core.ray_exporter
- data_juicer.core.tracer
- data_juicer.download.downloader
- data_juicer.download.wikipedia
- data_juicer.format.csv_formatter
- data_juicer.format.empty_formatter
- data_juicer.format.formatter
- data_juicer.format.json_formatter
- data_juicer.format.load
- data_juicer.format.parquet_formatter
- data_juicer.format.text_formatter
- data_juicer.format.tsv_formatter
- data_juicer.ops.aggregator.entity_attribute_aggregator
- data_juicer.ops.aggregator.meta_tags_aggregator
- data_juicer.ops.aggregator.most_relevant_entities_aggregator
- data_juicer.ops.aggregator.nested_aggregator
- data_juicer.ops.base_op
- data_juicer.ops.common.dwpose_func
- data_juicer.ops.common.helper_func
- data_juicer.ops.common.prompt2prompt_pipeline
- data_juicer.ops.deduplicator.document_deduplicator
- data_juicer.ops.deduplicator.document_minhash_deduplicator
- data_juicer.ops.deduplicator.document_simhash_deduplicator
- data_juicer.ops.deduplicator.image_deduplicator
- data_juicer.ops.deduplicator.ray_basic_deduplicator
- data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator
- data_juicer.ops.deduplicator.ray_document_deduplicator
- data_juicer.ops.deduplicator.ray_image_deduplicator
- data_juicer.ops.deduplicator.ray_video_deduplicator
- data_juicer.ops.deduplicator.video_deduplicator
- data_juicer.ops.filter.alphanumeric_filter
- data_juicer.ops.filter.audio_duration_filter
- data_juicer.ops.filter.audio_nmf_snr_filter
- data_juicer.ops.filter.audio_size_filter
- data_juicer.ops.filter.average_line_length_filter
- data_juicer.ops.filter.character_repetition_filter
- data_juicer.ops.filter.flagged_words_filter
- data_juicer.ops.filter.general_field_filter
- data_juicer.ops.filter.image_aesthetics_filter
- data_juicer.ops.filter.image_aspect_ratio_filter
- data_juicer.ops.filter.image_face_count_filter
- data_juicer.ops.filter.image_face_ratio_filter
- data_juicer.ops.filter.image_nsfw_filter
- data_juicer.ops.filter.image_pair_similarity_filter
- data_juicer.ops.filter.image_shape_filter
- data_juicer.ops.filter.image_size_filter
- data_juicer.ops.filter.image_text_matching_filter
- data_juicer.ops.filter.image_text_similarity_filter
- data_juicer.ops.filter.image_watermark_filter
- data_juicer.ops.filter.in_context_influence_filter
- data_juicer.ops.filter.instruction_following_difficulty_filter
- data_juicer.ops.filter.language_id_score_filter
- data_juicer.ops.filter.llm_analysis_filter
- data_juicer.ops.filter.llm_difficulty_score_filter
- data_juicer.ops.filter.llm_perplexity_filter
- data_juicer.ops.filter.llm_quality_score_filter
- data_juicer.ops.filter.llm_task_relevance_filter
- data_juicer.ops.filter.maximum_line_length_filter
- data_juicer.ops.filter.perplexity_filter
- data_juicer.ops.filter.phrase_grounding_recall_filter
- data_juicer.ops.filter.special_characters_filter
- data_juicer.ops.filter.specified_field_filter
- data_juicer.ops.filter.specified_numeric_field_filter
- data_juicer.ops.filter.stopwords_filter
- data_juicer.ops.filter.suffix_filter
- data_juicer.ops.filter.text_action_filter
- data_juicer.ops.filter.text_embd_similarity_filter
- data_juicer.ops.filter.text_entity_dependency_filter
- data_juicer.ops.filter.text_length_filter
- data_juicer.ops.filter.text_pair_similarity_filter
- data_juicer.ops.filter.token_num_filter
- data_juicer.ops.filter.video_aesthetics_filter
- data_juicer.ops.filter.video_aspect_ratio_filter
- data_juicer.ops.filter.video_duration_filter
- data_juicer.ops.filter.video_frames_text_similarity_filter
- data_juicer.ops.filter.video_motion_score_filter
- data_juicer.ops.filter.video_motion_score_raft_filter
- data_juicer.ops.filter.video_nsfw_filter
- data_juicer.ops.filter.video_ocr_area_ratio_filter
- data_juicer.ops.filter.video_resolution_filter
- data_juicer.ops.filter.video_tagging_from_frames_filter
- data_juicer.ops.filter.video_watermark_filter
- data_juicer.ops.filter.word_repetition_filter
- data_juicer.ops.filter.words_num_filter
- data_juicer.ops.grouper.key_value_grouper
- data_juicer.ops.grouper.naive_grouper
- data_juicer.ops.grouper.naive_reverse_grouper
- data_juicer.ops.load
- data_juicer.ops.mapper.annotation.annotation_mapper
- data_juicer.ops.mapper.annotation.human_preference_annotation_mapper
- data_juicer.ops.mapper.audio_add_gaussian_noise_mapper
- data_juicer.ops.mapper.audio_ffmpeg_wrapped_mapper
- data_juicer.ops.mapper.calibrate_qa_mapper
- data_juicer.ops.mapper.calibrate_query_mapper
- data_juicer.ops.mapper.calibrate_response_mapper
- data_juicer.ops.mapper.chinese_convert_mapper
- data_juicer.ops.mapper.clean_copyright_mapper
- data_juicer.ops.mapper.clean_email_mapper
- data_juicer.ops.mapper.clean_html_mapper
- data_juicer.ops.mapper.clean_ip_mapper
- data_juicer.ops.mapper.clean_links_mapper
- data_juicer.ops.mapper.detect_character_attributes_mapper
- data_juicer.ops.mapper.detect_character_locations_mapper
- data_juicer.ops.mapper.detect_main_character_mapper
- data_juicer.ops.mapper.dialog_intent_detection_mapper
- data_juicer.ops.mapper.dialog_sentiment_detection_mapper
- data_juicer.ops.mapper.dialog_sentiment_intensity_mapper
- data_juicer.ops.mapper.dialog_topic_detection_mapper
- data_juicer.ops.mapper.download_file_mapper
- data_juicer.ops.mapper.expand_macro_mapper
- data_juicer.ops.mapper.extract_entity_attribute_mapper
- data_juicer.ops.mapper.extract_entity_relation_mapper
- data_juicer.ops.mapper.extract_event_mapper
- data_juicer.ops.mapper.extract_keyword_mapper
- data_juicer.ops.mapper.extract_nickname_mapper
- data_juicer.ops.mapper.extract_support_text_mapper
- data_juicer.ops.mapper.extract_tables_from_html_mapper
- data_juicer.ops.mapper.fix_unicode_mapper
- data_juicer.ops.mapper.generate_qa_from_examples_mapper
- data_juicer.ops.mapper.generate_qa_from_text_mapper
- data_juicer.ops.mapper.image_blur_mapper
- data_juicer.ops.mapper.image_captioning_from_gpt4v_mapper
- data_juicer.ops.mapper.image_captioning_mapper
- data_juicer.ops.mapper.image_detection_yolo_mapper
- data_juicer.ops.mapper.image_diffusion_mapper
- data_juicer.ops.mapper.image_face_blur_mapper
- data_juicer.ops.mapper.image_mmpose_mapper
- data_juicer.ops.mapper.image_remove_background_mapper
- data_juicer.ops.mapper.image_sam_3d_body_mapper
- data_juicer.ops.mapper.image_segment_mapper
- data_juicer.ops.mapper.image_tagging_mapper
- data_juicer.ops.mapper.image_tagging_vlm_mapper
- data_juicer.ops.mapper.imgdiff_difference_area_generator_mapper
- data_juicer.ops.mapper.imgdiff_difference_caption_generator_mapper
- data_juicer.ops.mapper.mllm_mapper
- data_juicer.ops.mapper.nlpaug_en_mapper
- data_juicer.ops.mapper.nlpcda_zh_mapper
- data_juicer.ops.mapper.optimize_prompt_mapper
- data_juicer.ops.mapper.optimize_qa_mapper
- data_juicer.ops.mapper.optimize_query_mapper
- data_juicer.ops.mapper.optimize_response_mapper
- data_juicer.ops.mapper.pair_preference_mapper
- data_juicer.ops.mapper.punctuation_normalization_mapper
- data_juicer.ops.mapper.python_file_mapper
- data_juicer.ops.mapper.python_lambda_mapper
- data_juicer.ops.mapper.query_intent_detection_mapper
- data_juicer.ops.mapper.query_sentiment_detection_mapper
- data_juicer.ops.mapper.query_topic_detection_mapper
- data_juicer.ops.mapper.relation_identity_mapper
- data_juicer.ops.mapper.remove_bibliography_mapper
- data_juicer.ops.mapper.remove_comments_mapper
- data_juicer.ops.mapper.remove_header_mapper
- data_juicer.ops.mapper.remove_long_words_mapper
- data_juicer.ops.mapper.remove_non_chinese_character_mapper
- data_juicer.ops.mapper.remove_repeat_sentences_mapper
- data_juicer.ops.mapper.remove_specific_chars_mapper
- data_juicer.ops.mapper.remove_table_text_mapper
- data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper
- data_juicer.ops.mapper.replace_content_mapper
- data_juicer.ops.mapper.sdxl_prompt2prompt_mapper
- data_juicer.ops.mapper.sentence_augmentation_mapper
- data_juicer.ops.mapper.sentence_split_mapper
- data_juicer.ops.mapper.text_chunk_mapper
- data_juicer.ops.mapper.text_tagging_by_prompt_mapper
- data_juicer.ops.mapper.vggt_mapper
- data_juicer.ops.mapper.video_captioning_from_audio_mapper
- data_juicer.ops.mapper.video_captioning_from_frames_mapper
- data_juicer.ops.mapper.video_captioning_from_summarizer_mapper
- data_juicer.ops.mapper.video_captioning_from_video_mapper
- data_juicer.ops.mapper.video_captioning_from_vlm_mapper
- data_juicer.ops.mapper.video_depth_estimation_mapper
- data_juicer.ops.mapper.video_extract_frames_mapper
- data_juicer.ops.mapper.video_face_blur_mapper
- data_juicer.ops.mapper.video_ffmpeg_wrapped_mapper
- data_juicer.ops.mapper.video_hand_reconstruction_mapper
- data_juicer.ops.mapper.video_object_segmenting_mapper
- data_juicer.ops.mapper.video_remove_watermark_mapper
- data_juicer.ops.mapper.video_resize_aspect_ratio_mapper
- data_juicer.ops.mapper.video_resize_resolution_mapper
- data_juicer.ops.mapper.video_split_by_duration_mapper
- data_juicer.ops.mapper.video_split_by_key_frame_mapper
- data_juicer.ops.mapper.video_split_by_scene_mapper
- data_juicer.ops.mapper.video_tagging_from_audio_mapper
- data_juicer.ops.mapper.video_tagging_from_frames_mapper
- data_juicer.ops.mapper.video_whole_body_pose_estimation_mapper
- data_juicer.ops.mapper.whitespace_normalization_mapper
- data_juicer.ops.mixins
- data_juicer.ops.op_fusion
- data_juicer.ops.pipeline.llm_inference_with_ray_vllm_pipeline
- data_juicer.ops.pipeline.ray_vllm_pipeline
- data_juicer.ops.pipeline.vlm_inference_with_ray_vllm_pipeline
- data_juicer.ops.selector.frequency_specified_field_selector
- data_juicer.ops.selector.random_selector
- data_juicer.ops.selector.range_specified_field_selector
- data_juicer.ops.selector.tags_specified_field_selector
- data_juicer.ops.selector.topk_specified_field_selector
- data_juicer.tools.DJ_mcp_granular_ops
- data_juicer.tools.DJ_mcp_recipe_flow
- data_juicer.tools.hpo.execute_hpo_3sigma
- data_juicer.tools.hpo.objects
- data_juicer.tools.mcp_server
- data_juicer.tools.mcp_tool
- data_juicer.tools.op_search
- data_juicer.tools.quality_classifier.eval
- data_juicer.tools.quality_classifier.predict
- data_juicer.tools.quality_classifier.qc_utils
- data_juicer.tools.quality_classifier.train
- data_juicer.utils.asset_utils
- data_juicer.utils.cache_utils
- data_juicer.utils.ckpt_utils
- data_juicer.utils.common_utils
- data_juicer.utils.compress
- data_juicer.utils.constant
- data_juicer.utils.file_utils
- data_juicer.utils.fingerprint_utils
- data_juicer.utils.lazy_loader
- data_juicer.utils.logger_utils
- data_juicer.utils.mm_utils
- data_juicer.utils.model_utils
- data_juicer.utils.nltk_utils
- data_juicer.utils.process_utils
- data_juicer.utils.ray_utils
- data_juicer.utils.registry
- data_juicer.utils.resource_utils
- data_juicer.utils.s3_utils
- data_juicer.utils.sample
- data_juicer.utils.unittest_utils
- data_juicer.utils.video_utils
- data_juicer.utils.webdataset_utils