Index _ | A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P | Q | R | S | T | U | V | W | Z _ __init__() (data_juicer.analysis.column_wise_analysis.ColumnWiseAnalysis method) (data_juicer.analysis.ColumnWiseAnalysis method) (data_juicer.analysis.correlation_analysis.CorrelationAnalysis method) (data_juicer.analysis.CorrelationAnalysis method) (data_juicer.analysis.diversity_analysis.DiversityAnalysis method) (data_juicer.analysis.DiversityAnalysis method) (data_juicer.analysis.overall_analysis.OverallAnalysis method) (data_juicer.analysis.OverallAnalysis method) (data_juicer.download.downloader.DocumentDownloader method) (data_juicer.download.downloader.DocumentExtractor method) (data_juicer.download.downloader.DocumentIterator method) (data_juicer.download.wikipedia.WikipediaDownloader method) (data_juicer.download.wikipedia.WikipediaExtractor method) (data_juicer.download.wikipedia.WikipediaIterator method) (data_juicer.format.csv_formatter.CsvFormatter method) (data_juicer.format.CsvFormatter method) (data_juicer.format.empty_formatter.EmptyFormatter method) (data_juicer.format.empty_formatter.RayEmptyFormatter method) (data_juicer.format.EmptyFormatter method) (data_juicer.format.formatter.LocalFormatter method) (data_juicer.format.formatter.RemoteFormatter method) (data_juicer.format.json_formatter.JsonFormatter method) (data_juicer.format.JsonFormatter method) (data_juicer.format.LocalFormatter method) (data_juicer.format.parquet_formatter.ParquetFormatter method) (data_juicer.format.ParquetFormatter method) (data_juicer.format.RayEmptyFormatter method) (data_juicer.format.RemoteFormatter method) (data_juicer.format.text_formatter.TextFormatter method) (data_juicer.format.TextFormatter method) (data_juicer.format.tsv_formatter.TsvFormatter method) (data_juicer.format.TsvFormatter method) (data_juicer.ops.aggregator.entity_attribute_aggregator.EntityAttributeAggregator method) (data_juicer.ops.aggregator.EntityAttributeAggregator method) (data_juicer.ops.aggregator.meta_tags_aggregator.MetaTagsAggregator method) (data_juicer.ops.aggregator.MetaTagsAggregator method) (data_juicer.ops.aggregator.most_relevant_entities_aggregator.MostRelevantEntitiesAggregator method) (data_juicer.ops.aggregator.MostRelevantEntitiesAggregator method) (data_juicer.ops.aggregator.nested_aggregator.NestedAggregator method) (data_juicer.ops.aggregator.NestedAggregator method) (data_juicer.ops.base_op.Aggregator method) (data_juicer.ops.base_op.Deduplicator method) (data_juicer.ops.base_op.Filter method) (data_juicer.ops.base_op.Grouper method) (data_juicer.ops.base_op.Mapper method) (data_juicer.ops.base_op.OP method) (data_juicer.ops.base_op.Selector method) (data_juicer.ops.common.dwpose_func.DWposeDetector method) (data_juicer.ops.common.dwpose_func.Wholebody method) (data_juicer.ops.common.helper_func.UnionFind method) (data_juicer.ops.common.prompt2prompt_pipeline.AttentionControl method) (data_juicer.ops.common.prompt2prompt_pipeline.AttentionControlEdit method) (data_juicer.ops.common.prompt2prompt_pipeline.AttentionRefine method) (data_juicer.ops.common.prompt2prompt_pipeline.AttentionReplace method) (data_juicer.ops.common.prompt2prompt_pipeline.AttentionReweight method) (data_juicer.ops.common.prompt2prompt_pipeline.AttentionStore method) (data_juicer.ops.common.prompt2prompt_pipeline.LocalBlend method) (data_juicer.ops.common.prompt2prompt_pipeline.P2PCrossAttnProcessor method) (data_juicer.ops.common.prompt2prompt_pipeline.ScoreParams method) (data_juicer.ops.deduplicator.document_deduplicator.DocumentDeduplicator method) (data_juicer.ops.deduplicator.document_minhash_deduplicator.DocumentMinhashDeduplicator method) (data_juicer.ops.deduplicator.document_simhash_deduplicator.DocumentSimhashDeduplicator method) (data_juicer.ops.deduplicator.DocumentDeduplicator method) (data_juicer.ops.deduplicator.DocumentMinhashDeduplicator method) (data_juicer.ops.deduplicator.DocumentSimhashDeduplicator method) (data_juicer.ops.deduplicator.image_deduplicator.ImageDeduplicator method) (data_juicer.ops.deduplicator.ImageDeduplicator method) (data_juicer.ops.deduplicator.ray_basic_deduplicator.ActorBackend method) (data_juicer.ops.deduplicator.ray_basic_deduplicator.Backend method) (data_juicer.ops.deduplicator.ray_basic_deduplicator.DedupSet method) (data_juicer.ops.deduplicator.ray_basic_deduplicator.RayBasicDeduplicator method) (data_juicer.ops.deduplicator.ray_basic_deduplicator.RedisBackend method) (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.BTSUnionFind method) (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.EdgeBuffer method) (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.GPUMinHashActor method) (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.IdGenerator method) (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.RayBTSMinhashDeduplicator method) (data_juicer.ops.deduplicator.ray_document_deduplicator.RayDocumentDeduplicator method) (data_juicer.ops.deduplicator.ray_image_deduplicator.RayImageDeduplicator method) (data_juicer.ops.deduplicator.ray_video_deduplicator.RayVideoDeduplicator method) (data_juicer.ops.deduplicator.RayBasicDeduplicator method) (data_juicer.ops.deduplicator.RayBTSMinhashDeduplicator method) (data_juicer.ops.deduplicator.RayDocumentDeduplicator method) (data_juicer.ops.deduplicator.RayImageDeduplicator method) (data_juicer.ops.deduplicator.RayVideoDeduplicator method) (data_juicer.ops.deduplicator.video_deduplicator.VideoDeduplicator method) (data_juicer.ops.deduplicator.VideoDeduplicator method) (data_juicer.ops.filter.alphanumeric_filter.AlphanumericFilter method) (data_juicer.ops.filter.audio_duration_filter.AudioDurationFilter method) (data_juicer.ops.op_fusion.FusedFilter method) (data_juicer.ops.op_fusion.GeneralFusedOP method) (data_juicer.utils.cache_utils.DatasetCacheControl method) (data_juicer.utils.ckpt_utils.CheckpointManager method) (data_juicer.utils.compress.CacheCompressManager method) (data_juicer.utils.compress.CompressManager method) (data_juicer.utils.fingerprint_utils.Hasher method) (data_juicer.utils.lazy_loader.LazyLoader method) (data_juicer.utils.logger_utils.StreamToLoguru method) (data_juicer.utils.model_utils.ChatAPIModel method) (data_juicer.utils.model_utils.EmbeddingAPIModel method) (data_juicer.utils.registry.Registry method) (data_juicer.utils.video_utils.AVReader method) (data_juicer.utils.video_utils.Clip method) (data_juicer.utils.video_utils.DecordReader method) (data_juicer.utils.video_utils.FFmpegReader method) (data_juicer.utils.video_utils.Frames method) (data_juicer.utils.video_utils.VideoMetadata method) (data_juicer.utils.video_utils.VideoReader method) A ActorBackend (class in data_juicer.ops.deduplicator.ray_basic_deduplicator) add_key_value_pairs() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.BTSUnionFind method) add_parameters() (data_juicer.ops.base_op.OP method) add_suffix_to_filename() (in module data_juicer.utils.file_utils) add_suffixes() (in module data_juicer.format.formatter) Aggregator (class in data_juicer.ops.base_op) alnum_ratio (data_juicer.utils.constant.StatsKeysConstant attribute) alpha_token_ratio (data_juicer.utils.constant.StatsKeysConstant attribute) AlphanumericFilter (class in data_juicer.ops.filter.alphanumeric_filter) analyze() (data_juicer.analysis.column_wise_analysis.ColumnWiseAnalysis method) (data_juicer.analysis.ColumnWiseAnalysis method) (data_juicer.analysis.correlation_analysis.CorrelationAnalysis method) (data_juicer.analysis.CorrelationAnalysis method) (data_juicer.analysis.diversity_analysis.DiversityAnalysis method) (data_juicer.analysis.DiversityAnalysis method) (data_juicer.analysis.overall_analysis.OverallAnalysis method) (data_juicer.analysis.OverallAnalysis method) analyze_resource_util_list() (data_juicer.core.monitor.Monitor static method) analyze_single_resource_util() (data_juicer.core.monitor.Monitor static method) annotate_heatmap() (in module data_juicer.analysis.correlation_analysis) aspect_ratios (data_juicer.utils.constant.StatsKeysConstant attribute) AttentionControl (class in data_juicer.ops.common.prompt2prompt_pipeline) AttentionControlEdit (class in data_juicer.ops.common.prompt2prompt_pipeline) AttentionRefine (class in data_juicer.ops.common.prompt2prompt_pipeline) AttentionReplace (class in data_juicer.ops.common.prompt2prompt_pipeline) AttentionReweight (class in data_juicer.ops.common.prompt2prompt_pipeline) AttentionStore (class in data_juicer.ops.common.prompt2prompt_pipeline) attribute_descriptions (data_juicer.utils.constant.MetaKeys attribute) attribute_summary() (data_juicer.ops.aggregator.entity_attribute_aggregator.EntityAttributeAggregator method) (data_juicer.ops.aggregator.EntityAttributeAggregator method) attribute_support_texts (data_juicer.utils.constant.MetaKeys attribute) attributes (data_juicer.utils.constant.MetaKeys attribute) audio (data_juicer.utils.mm_utils.SpecialTokens attribute) audio_duration (data_juicer.utils.constant.StatsKeysConstant attribute) audio_nmf_snr (data_juicer.utils.constant.StatsKeysConstant attribute) audio_sizes (data_juicer.utils.constant.StatsKeysConstant attribute) AudioDurationFilter (class in data_juicer.ops.filter.audio_duration_filter) AV_STREAM_THREAD_TYPE (in module data_juicer.utils.mm_utils) available_gpu_memories() (in module data_juicer.utils.resource_utils) available_memories() (in module data_juicer.utils.resource_utils) avg_line_length (data_juicer.utils.constant.StatsKeysConstant attribute) avg_split_string_list_under_limit() (in module data_juicer.utils.common_utils) AVReader (class in data_juicer.utils.video_utils) B Backend (class in data_juicer.ops.deduplicator.ray_basic_deduplicator) balanced_union_find() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.BTSUnionFind method) band_minhash() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.RayBTSMinhashDeduplicator method) (data_juicer.ops.deduplicator.RayBTSMinhashDeduplicator method) BaseCompressor (class in data_juicer.utils.compress) BaseFormatter (class in data_juicer.format.formatter) batch_meta (data_juicer.utils.constant.Fields attribute) BatchMetaKeys (class in data_juicer.utils.constant) bbox_tag (data_juicer.utils.constant.MetaKeys attribute) bbox_xyxy2cs() (in module data_juicer.ops.common.dwpose_func) between_steps() (data_juicer.ops.common.prompt2prompt_pipeline.AttentionControl method) (data_juicer.ops.common.prompt2prompt_pipeline.AttentionStore method) BTSUnionFind (class in data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator) byte_size_to_size_str() (in module data_juicer.utils.file_utils) C CacheCompressManager (class in data_juicer.utils.compress) calc_minhash() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.RayBTSMinhashDeduplicator method) (data_juicer.ops.deduplicator.RayBTSMinhashDeduplicator method) calculate_hash() (data_juicer.ops.deduplicator.ray_basic_deduplicator.RayBasicDeduplicator method) (data_juicer.ops.deduplicator.ray_document_deduplicator.RayDocumentDeduplicator method) (data_juicer.ops.deduplicator.ray_image_deduplicator.RayImageDeduplicator method) (data_juicer.ops.deduplicator.ray_video_deduplicator.RayVideoDeduplicator method) (data_juicer.ops.deduplicator.RayBasicDeduplicator method) (data_juicer.ops.deduplicator.RayDocumentDeduplicator method) (data_juicer.ops.deduplicator.RayImageDeduplicator method) (data_juicer.ops.deduplicator.RayVideoDeduplicator method) calculate_np() (in module data_juicer.utils.process_utils) calculate_ray_np() (in module data_juicer.utils.process_utils) calculate_resized_dimensions() (in module data_juicer.utils.mm_utils) catch_map_batches_exception() (in module data_juicer.ops.base_op) catch_map_single_exception() (in module data_juicer.ops.base_op) category_to_hist() (data_juicer.analysis.measure.RelatedTTestMeasure static method) char_rep_ratio (data_juicer.utils.constant.StatsKeysConstant attribute) ChatAPIModel (class in data_juicer.utils.model_utils) check_and_initialize_ray() (in module data_juicer.utils.ray_utils) check_ckpt() (data_juicer.utils.ckpt_utils.CheckpointManager method) check_inputs() (data_juicer.ops.common.prompt2prompt_pipeline.Prompt2PromptPipeline method) check_model() (in module data_juicer.utils.model_utils) check_model_home() (in module data_juicer.utils.model_utils) check_op_method_param() (in module data_juicer.utils.common_utils) check_ops_to_skip() (data_juicer.utils.ckpt_utils.CheckpointManager method) check_packages() (data_juicer.utils.lazy_loader.LazyLoader class method) check_time_span() (data_juicer.utils.video_utils.VideoReader method) CheckpointManager (class in data_juicer.utils.ckpt_utils) class_label_tag (data_juicer.utils.constant.MetaKeys attribute) clean_nltk_cache() (in module data_juicer.utils.nltk_utils) cleanup_cache_files() (data_juicer.utils.compress.CacheCompressManager method) cleanup_compressed_cache_files() (in module data_juicer.utils.compress) clear() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.EdgeBuffer method) Clip (class in data_juicer.utils.video_utils) close() (data_juicer.utils.video_utils.AVReader method) (data_juicer.utils.video_utils.DecordReader method) (data_juicer.utils.video_utils.FFmpegReader method) (data_juicer.utils.video_utils.VideoReader method) close_video() (in module data_juicer.utils.mm_utils) ColumnWiseAnalysis (class in data_juicer.analysis) (class in data_juicer.analysis.column_wise_analysis) communication() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.BTSUnionFind method) compress() (data_juicer.utils.compress.BaseCompressor static method) (data_juicer.utils.compress.CacheCompressManager method) (data_juicer.utils.compress.CompressManager method) (data_juicer.utils.compress.Compressor class method) (data_juicer.utils.compress.GzipCompressor static method) (data_juicer.utils.compress.Lz4Compressor static method) (data_juicer.utils.compress.ZstdCompressor static method) (in module data_juicer.utils.compress) CompressionOff (class in data_juicer.utils.compress) CompressManager (class in data_juicer.utils.compress) Compressor (class in data_juicer.utils.compress) compressors (data_juicer.utils.compress.Compressor attribute) compute() (data_juicer.analysis.diversity_analysis.DiversityAnalysis method) (data_juicer.analysis.DiversityAnalysis method) compute_hash() (data_juicer.ops.base_op.Deduplicator method) (data_juicer.ops.deduplicator.document_deduplicator.DocumentDeduplicator method) (data_juicer.ops.deduplicator.document_minhash_deduplicator.DocumentMinhashDeduplicator method) (data_juicer.ops.deduplicator.document_simhash_deduplicator.DocumentSimhashDeduplicator method) (data_juicer.ops.deduplicator.DocumentDeduplicator method) (data_juicer.ops.deduplicator.DocumentMinhashDeduplicator method) (data_juicer.ops.deduplicator.DocumentSimhashDeduplicator method) (data_juicer.ops.deduplicator.image_deduplicator.ImageDeduplicator method) (data_juicer.ops.deduplicator.ImageDeduplicator method) (data_juicer.ops.deduplicator.video_deduplicator.VideoDeduplicator method) (data_juicer.ops.deduplicator.VideoDeduplicator method) compute_minhash() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.GPUMinHashActor method) compute_stats_batched() (data_juicer.ops.base_op.Filter method) (data_juicer.ops.filter.alphanumeric_filter.AlphanumericFilter method) (data_juicer.ops.op_fusion.FusedFilter method) compute_stats_single() (data_juicer.ops.base_op.Filter method) (data_juicer.ops.deduplicator.ray_basic_deduplicator.RayBasicDeduplicator method) (data_juicer.ops.deduplicator.RayBasicDeduplicator method) (data_juicer.ops.filter.audio_duration_filter.AudioDurationFilter method) config_backup() (in module data_juicer.config.config) context (data_juicer.utils.constant.Fields attribute) convert_arrow_to_python() (in module data_juicer.ops.base_op) convert_dict_list_to_list_dict() (in module data_juicer.ops.base_op) convert_list_dict_to_dict_list() (in module data_juicer.ops.base_op) copy_data() (in module data_juicer.utils.file_utils) CorrelationAnalysis (class in data_juicer.analysis) (class in data_juicer.analysis.correlation_analysis) cpu_count() (in module data_juicer.utils.resource_utils) create_controller() (in module data_juicer.ops.common.prompt2prompt_pipeline) create_directory_if_not_exists() (in module data_juicer.utils.file_utils) create_physical_resource_alias() (in module data_juicer.utils.nltk_utils) create_pyarrow_s3_filesystem() (in module data_juicer.utils.s3_utils) create_video_reader() (in module data_juicer.utils.video_utils) CrossEntropyMeasure (class in data_juicer.analysis.measure) CsvFormatter (class in data_juicer.format) (class in data_juicer.format.csv_formatter) cuda_device_count() (in module data_juicer.utils.resource_utils) cut_video_by_seconds() (in module data_juicer.utils.mm_utils) D data_juicer module data_juicer.analysis module data_juicer.analysis.column_wise_analysis module data_juicer.analysis.correlation_analysis module data_juicer.analysis.diversity_analysis module data_juicer.analysis.measure module data_juicer.analysis.overall_analysis module data_juicer.config module data_juicer.config.config module data_juicer.core.monitor module data_juicer.download module data_juicer.download.commoncrawl module data_juicer.download.downloader module data_juicer.download.wikipedia module data_juicer.format module data_juicer.format.csv_formatter module data_juicer.format.empty_formatter module data_juicer.format.formatter module data_juicer.format.json_formatter module data_juicer.format.load module data_juicer.format.parquet_formatter module data_juicer.format.text_formatter module data_juicer.format.tsv_formatter module data_juicer.ops.aggregator module data_juicer.ops.aggregator.entity_attribute_aggregator module data_juicer.ops.aggregator.meta_tags_aggregator module data_juicer.ops.aggregator.most_relevant_entities_aggregator module data_juicer.ops.aggregator.nested_aggregator module data_juicer.ops.base_op module data_juicer.ops.common module data_juicer.ops.common.dwpose_func module data_juicer.ops.common.helper_func module data_juicer.ops.common.prompt2prompt_pipeline module data_juicer.ops.common.special_characters module data_juicer.ops.deduplicator module data_juicer.ops.deduplicator.document_deduplicator module data_juicer.ops.deduplicator.document_minhash_deduplicator module data_juicer.ops.deduplicator.document_simhash_deduplicator module data_juicer.ops.deduplicator.image_deduplicator module data_juicer.ops.deduplicator.ray_basic_deduplicator module data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator module data_juicer.ops.deduplicator.ray_document_deduplicator module data_juicer.ops.deduplicator.ray_image_deduplicator module data_juicer.ops.deduplicator.ray_video_deduplicator module data_juicer.ops.deduplicator.video_deduplicator module data_juicer.ops.filter.alphanumeric_filter module data_juicer.ops.filter.audio_duration_filter module data_juicer.ops.load module data_juicer.ops.op_fusion module data_juicer.tools module data_juicer.tools.hpo module data_juicer.tools.mcp_server module data_juicer.tools.quality_classifier module data_juicer.tools.quality_classifier.eval module data_juicer.tools.quality_classifier.predict module data_juicer.tools.quality_classifier.qc_utils module data_juicer.tools.quality_classifier.train module data_juicer.utils module data_juicer.utils.asset_utils module data_juicer.utils.availability_utils module data_juicer.utils.cache_utils module data_juicer.utils.ckpt_utils module data_juicer.utils.common_utils module data_juicer.utils.compress module data_juicer.utils.constant module data_juicer.utils.file_utils module data_juicer.utils.fingerprint_utils module data_juicer.utils.lazy_loader module data_juicer.utils.logger_utils module data_juicer.utils.mm_utils module data_juicer.utils.model_utils module data_juicer.utils.nltk_utils module data_juicer.utils.process_utils module data_juicer.utils.ray_utils module data_juicer.utils.registry module data_juicer.utils.resource_utils module data_juicer.utils.s3_utils module data_juicer.utils.sample module data_juicer.utils.video_utils module data_juicer.utils.webdataset_utils module dataset_cache_control() (in module data_juicer.utils.cache_utils) DatasetCacheControl (class in data_juicer.utils.cache_utils) decode() (in module data_juicer.ops.common.dwpose_func) decompress() (data_juicer.utils.compress.CacheCompressManager method) (data_juicer.utils.compress.CompressManager method) (in module data_juicer.utils.compress) DecordReader (class in data_juicer.utils.video_utils) Deduplicator (class in data_juicer.ops.base_op) DedupSet (class in data_juicer.ops.deduplicator.ray_basic_deduplicator) DEFAULT_EXAMPLE_PROMPT (data_juicer.ops.aggregator.entity_attribute_aggregator.EntityAttributeAggregator attribute) (data_juicer.ops.aggregator.EntityAttributeAggregator attribute) DEFAULT_INPUT_TEMPLATE (data_juicer.ops.aggregator.entity_attribute_aggregator.EntityAttributeAggregator attribute) (data_juicer.ops.aggregator.EntityAttributeAggregator attribute) (data_juicer.ops.aggregator.meta_tags_aggregator.MetaTagsAggregator attribute) (data_juicer.ops.aggregator.MetaTagsAggregator attribute) (data_juicer.ops.aggregator.most_relevant_entities_aggregator.MostRelevantEntitiesAggregator attribute) (data_juicer.ops.aggregator.MostRelevantEntitiesAggregator attribute) (data_juicer.ops.aggregator.nested_aggregator.NestedAggregator attribute) (data_juicer.ops.aggregator.NestedAggregator attribute) DEFAULT_OUTPUT_PATTERN (data_juicer.ops.aggregator.meta_tags_aggregator.MetaTagsAggregator attribute) (data_juicer.ops.aggregator.MetaTagsAggregator attribute) (data_juicer.ops.aggregator.most_relevant_entities_aggregator.MostRelevantEntitiesAggregator attribute) (data_juicer.ops.aggregator.MostRelevantEntitiesAggregator attribute) DEFAULT_OUTPUT_PATTERN_TEMPLATE (data_juicer.ops.aggregator.entity_attribute_aggregator.EntityAttributeAggregator attribute) (data_juicer.ops.aggregator.EntityAttributeAggregator attribute) DEFAULT_SUB_DOC_TEMPLATE (data_juicer.ops.aggregator.nested_aggregator.NestedAggregator attribute) (data_juicer.ops.aggregator.NestedAggregator attribute) DEFAULT_SYSTEM_PROMPT (data_juicer.ops.aggregator.meta_tags_aggregator.MetaTagsAggregator attribute) (data_juicer.ops.aggregator.MetaTagsAggregator attribute) (data_juicer.ops.aggregator.nested_aggregator.NestedAggregator attribute) (data_juicer.ops.aggregator.NestedAggregator attribute) DEFAULT_SYSTEM_TEMPLATE (data_juicer.ops.aggregator.entity_attribute_aggregator.EntityAttributeAggregator attribute) (data_juicer.ops.aggregator.EntityAttributeAggregator attribute) (data_juicer.ops.aggregator.most_relevant_entities_aggregator.MostRelevantEntitiesAggregator attribute) (data_juicer.ops.aggregator.MostRelevantEntitiesAggregator attribute) DEFAULT_TAG_TEMPLATE (data_juicer.ops.aggregator.meta_tags_aggregator.MetaTagsAggregator attribute) (data_juicer.ops.aggregator.MetaTagsAggregator attribute) DEFAULT_TARGET_TAG_TEMPLATE (data_juicer.ops.aggregator.meta_tags_aggregator.MetaTagsAggregator attribute) (data_juicer.ops.aggregator.MetaTagsAggregator attribute) demo_postprocess() (in module data_juicer.ops.common.dwpose_func) deprecated() (in module data_juicer.utils.common_utils) detect_faces() (in module data_juicer.utils.mm_utils) dialog_intent_labels (data_juicer.utils.constant.MetaKeys attribute) dialog_intent_labels_analysis (data_juicer.utils.constant.MetaKeys attribute) dialog_sentiment_intensity (data_juicer.utils.constant.MetaKeys attribute) dialog_sentiment_intensity_analysis (data_juicer.utils.constant.MetaKeys attribute) dialog_sentiment_labels (data_juicer.utils.constant.MetaKeys attribute) dialog_sentiment_labels_analysis (data_juicer.utils.constant.MetaKeys attribute) dialog_topic_labels (data_juicer.utils.constant.MetaKeys attribute) dialog_topic_labels_analysis (data_juicer.utils.constant.MetaKeys attribute) dict_to_hash() (in module data_juicer.utils.common_utils) dispatch (data_juicer.utils.fingerprint_utils.Hasher attribute) display_config() (in module data_juicer.config.config) distribute_edge() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.BTSUnionFind method) DiversityAnalysis (class in data_juicer.analysis) (class in data_juicer.analysis.diversity_analysis) dj_configs (data_juicer.utils.constant.JobRequiredKeys attribute) DocumentDeduplicator (class in data_juicer.ops.deduplicator) (class in data_juicer.ops.deduplicator.document_deduplicator) DocumentDownloader (class in data_juicer.download.downloader) DocumentExtractor (class in data_juicer.download.downloader) DocumentIterator (class in data_juicer.download.downloader) DocumentMinhashDeduplicator (class in data_juicer.ops.deduplicator) (class in data_juicer.ops.deduplicator.document_minhash_deduplicator) DocumentMinhashDeduplicatorWithUid (class in data_juicer.ops.deduplicator) (class in data_juicer.ops.deduplicator.document_minhash_deduplicator) DocumentSimhashDeduplicator (class in data_juicer.ops.deduplicator) (class in data_juicer.ops.deduplicator.document_simhash_deduplicator) download() (data_juicer.download.downloader.DocumentDownloader method) (data_juicer.download.wikipedia.WikipediaDownloader method) download_and_extract() (in module data_juicer.download.downloader) download_file() (in module data_juicer.utils.file_utils) download_wikipedia() (in module data_juicer.download.wikipedia) draw_bodypose() (in module data_juicer.ops.common.dwpose_func) draw_box() (data_juicer.analysis.column_wise_analysis.ColumnWiseAnalysis method) (data_juicer.analysis.ColumnWiseAnalysis method) draw_facepose() (in module data_juicer.ops.common.dwpose_func) draw_handpose() (in module data_juicer.ops.common.dwpose_func) draw_heatmap() (in module data_juicer.analysis.correlation_analysis) draw_hist() (data_juicer.analysis.column_wise_analysis.ColumnWiseAnalysis method) (data_juicer.analysis.ColumnWiseAnalysis method) draw_pose() (in module data_juicer.ops.common.dwpose_func) draw_resource_util_graph() (data_juicer.core.monitor.Monitor static method) draw_wordcloud() (data_juicer.analysis.column_wise_analysis.ColumnWiseAnalysis method) (data_juicer.analysis.ColumnWiseAnalysis method) dup_idx() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.BTSUnionFind method) duration (data_juicer.utils.video_utils.VideoMetadata attribute) DWposeDetector (class in data_juicer.ops.common.dwpose_func) DYNAMIC_FIELDS (data_juicer.core.monitor.Monitor attribute) E edge_redistribution() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.BTSUnionFind method) EdgeBuffer (class in data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator) EmbeddingAPIModel (class in data_juicer.utils.model_utils) EMPTY_HASH_VALUE (data_juicer.ops.deduplicator.ray_basic_deduplicator.RayBasicDeduplicator attribute) (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.RayBTSMinhashDeduplicator attribute) (data_juicer.ops.deduplicator.RayBasicDeduplicator attribute) (data_juicer.ops.deduplicator.RayBTSMinhashDeduplicator attribute) empty_history() (data_juicer.ops.base_op.OP method) EmptyControl (class in data_juicer.ops.common.prompt2prompt_pipeline) EmptyFormatter (class in data_juicer.format) (class in data_juicer.format.empty_formatter) encoded_data (data_juicer.utils.video_utils.Clip attribute) ensure_nltk_resource() (in module data_juicer.utils.nltk_utils) entity (data_juicer.utils.constant.MetaKeys attribute) entity_attribute (data_juicer.utils.constant.BatchMetaKeys attribute) entity_description (data_juicer.utils.constant.MetaKeys attribute) entity_name (data_juicer.utils.constant.MetaKeys attribute) entity_type (data_juicer.utils.constant.MetaKeys attribute) EntityAttributeAggregator (class in data_juicer.ops.aggregator) (class in data_juicer.ops.aggregator.entity_attribute_aggregator) EntropyMeasure (class in data_juicer.analysis.measure) eoc (data_juicer.utils.mm_utils.SpecialTokens attribute) eval() (in module data_juicer.tools.quality_classifier.qc_utils) event_description (data_juicer.utils.constant.MetaKeys attribute) expand_outdir_and_mkdir() (in module data_juicer.utils.file_utils) export_config() (in module data_juicer.config) (in module data_juicer.config.config) export_result() (in module data_juicer.tools.quality_classifier.qc_utils) extra_configs (data_juicer.utils.constant.JobRequiredKeys attribute) extract() (data_juicer.download.downloader.DocumentExtractor method) (data_juicer.download.wikipedia.WikipediaExtractor method) (data_juicer.utils.compress.Extractor class method) extract_audio_from_video() (in module data_juicer.utils.mm_utils) extract_clip() (data_juicer.utils.video_utils.AVReader method) (data_juicer.utils.video_utils.DecordReader method) (data_juicer.utils.video_utils.FFmpegReader method) (data_juicer.utils.video_utils.VideoReader method) extract_frames() (data_juicer.utils.video_utils.AVReader method) (data_juicer.utils.video_utils.DecordReader method) (data_juicer.utils.video_utils.FFmpegReader method) (data_juicer.utils.video_utils.VideoReader method) extract_key_frames() (in module data_juicer.utils.mm_utils) extract_key_frames_by_seconds() (in module data_juicer.utils.mm_utils) extract_keyframes() (data_juicer.utils.video_utils.AVReader method) (data_juicer.utils.video_utils.DecordReader method) (data_juicer.utils.video_utils.FFmpegReader method) (data_juicer.utils.video_utils.VideoReader method) extract_txt_from_docx() (in module data_juicer.format.text_formatter) extract_txt_from_pdf() (in module data_juicer.format.text_formatter) extract_video_frames_uniformly() (in module data_juicer.utils.mm_utils) extract_video_frames_uniformly_by_seconds() (in module data_juicer.utils.mm_utils) Extractor (class in data_juicer.utils.compress) F face_counts (data_juicer.utils.constant.StatsKeysConstant attribute) face_detections (data_juicer.utils.constant.StatsKeysConstant attribute) face_ratios (data_juicer.utils.constant.StatsKeysConstant attribute) faceDetect() (in module data_juicer.ops.common.dwpose_func) FFmpegReader (class in data_juicer.utils.video_utils) Fields (class in data_juicer.utils.constant) FileLock (class in data_juicer.utils.compress) fileno() (data_juicer.utils.logger_utils.StreamToLoguru method) Filter (class in data_juicer.ops.base_op) filter_arguments() (in module data_juicer.utils.model_utils) filter_with_union_find() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.RayBTSMinhashDeduplicator method) (data_juicer.ops.deduplicator.RayBTSMinhashDeduplicator method) find() (data_juicer.ops.common.helper_func.UnionFind method) (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.BTSUnionFind method) find_files_with_suffix() (in module data_juicer.utils.file_utils) find_root_verb_and_its_dobj() (in module data_juicer.analysis.diversity_analysis) find_root_verb_and_its_dobj_in_string() (in module data_juicer.analysis.diversity_analysis) flagged_words_ratio (data_juicer.utils.constant.StatsKeysConstant attribute) flush() (data_juicer.utils.logger_utils.StreamToLoguru method) flush_key_value_pairs() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.BTSUnionFind method) follow_read() (in module data_juicer.utils.file_utils) format_cache_file_name() (data_juicer.utils.compress.CacheCompressManager method) forward() (data_juicer.ops.common.prompt2prompt_pipeline.AttentionControl method) (data_juicer.ops.common.prompt2prompt_pipeline.AttentionControlEdit method) (data_juicer.ops.common.prompt2prompt_pipeline.AttentionStore method) (data_juicer.ops.common.prompt2prompt_pipeline.EmptyControl method) fps (data_juicer.utils.video_utils.VideoMetadata attribute) Frames (class in data_juicer.utils.video_utils) frames (data_juicer.utils.video_utils.Clip attribute) (data_juicer.utils.video_utils.Frames attribute) free_models() (in module data_juicer.utils.model_utils) fuse_filter_group() (in module data_juicer.ops.op_fusion) fuse_operators() (in module data_juicer.ops.op_fusion) FusedFilter (class in data_juicer.ops.op_fusion) G general_field_filter_condition (data_juicer.utils.constant.StatsKeysConstant attribute) GeneralFusedOP (class in data_juicer.ops.op_fusion) generate_fingerprint() (in module data_juicer.utils.fingerprint_utils) get() (data_juicer.utils.registry.Registry method) get_access_log() (data_juicer.utils.constant.StatsKeysMeta method) get_aligned_sequences() (in module data_juicer.ops.common.prompt2prompt_pipeline) get_all_dependencies() (data_juicer.utils.lazy_loader.LazyLoader class method) get_all_files_paths_under() (in module data_juicer.utils.file_utils) get_arxiv_urls() (in module data_juicer.download.downloader) get_average_attention() (data_juicer.ops.common.prompt2prompt_pipeline.AttentionStore method) get_aws_credentials() (in module data_juicer.utils.s3_utils) get_backup_model_link() (in module data_juicer.utils.model_utils) get_caller_name() (in module data_juicer.utils.logger_utils) get_cpu_utilization() (in module data_juicer.utils.resource_utils) get_decoded_frames_from_video() (in module data_juicer.utils.mm_utils) get_default_cfg() (in module data_juicer.config) (in module data_juicer.config.config) get_diversity() (in module data_juicer.analysis.diversity_analysis) get_edges() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.EdgeBuffer method) get_empty_store() (data_juicer.ops.common.prompt2prompt_pipeline.AttentionStore static method) get_equalizer() (in module data_juicer.ops.common.prompt2prompt_pipeline) get_file_size() (in module data_juicer.utils.mm_utils) get_hash_method() (in module data_juicer.ops.deduplicator.image_deduplicator) (in module data_juicer.ops.deduplicator.ray_image_deduplicator) get_init_configs() (in module data_juicer.config) (in module data_juicer.config.config) get_keep_boolean() (data_juicer.ops.base_op.Filter method) get_keep_method_udf() (in module data_juicer.tools.quality_classifier.qc_utils) get_key_frame_seconds() (in module data_juicer.utils.mm_utils) get_left_process_list() (data_juicer.utils.ckpt_utils.CheckpointManager method) get_log_file_path() (in module data_juicer.utils.logger_utils) get_mapper() (in module data_juicer.ops.common.prompt2prompt_pipeline) get_matrix() (in module data_juicer.ops.common.prompt2prompt_pipeline) get_metadata() (data_juicer.utils.video_utils.AVReader method) (data_juicer.utils.video_utils.DecordReader method) (data_juicer.utils.video_utils.FFmpegReader method) (data_juicer.utils.video_utils.VideoReader method) get_min_cuda_memory() (in module data_juicer.utils.process_utils) get_model() (in module data_juicer.utils.model_utils) get_next_id() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.IdGenerator method) get_package_name() (data_juicer.utils.lazy_loader.LazyLoader class method) get_ray_nodes_info() (in module data_juicer.utils.ray_utils) get_refinement_mapper() (in module data_juicer.ops.common.prompt2prompt_pipeline) get_remote_classes() (in module data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator) get_remote_dedup_set() (in module data_juicer.ops.deduplicator.ray_basic_deduplicator) get_replacement_mapper() (in module data_juicer.ops.common.prompt2prompt_pipeline) get_replacement_mapper_() (in module data_juicer.ops.common.prompt2prompt_pipeline) get_row_col() (in module data_juicer.analysis.column_wise_analysis) get_sentences_from_document() (in module data_juicer.ops.common) (in module data_juicer.ops.common.helper_func) get_simcc_maximum() (in module data_juicer.ops.common.dwpose_func) get_special_tokens() (in module data_juicer.utils.mm_utils) get_time_words_attention_alpha() (in module data_juicer.ops.common.prompt2prompt_pipeline) get_toml_file_path() (in module data_juicer.utils.lazy_loader) get_traceback_matrix() (in module data_juicer.ops.common.prompt2prompt_pipeline) get_uv_lock_path() (in module data_juicer.utils.lazy_loader) get_video_duration() (in module data_juicer.utils.mm_utils) get_warp_matrix() (in module data_juicer.ops.common.dwpose_func) get_wikipedia_urls() (in module data_juicer.download.downloader) get_word_inds() (in module data_juicer.ops.common.prompt2prompt_pipeline) get_words_from_document() (in module data_juicer.ops.common) (in module data_juicer.ops.common.helper_func) getvalue() (data_juicer.utils.logger_utils.StreamToLoguru method) GiB (data_juicer.utils.file_utils.Sizes attribute) global_align() (in module data_juicer.ops.common.prompt2prompt_pipeline) GPUMinHashActor (class in data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator) Grouper (class in data_juicer.ops.base_op) GzipCompressor (class in data_juicer.utils.compress) H hand_reconstruction_tags (data_juicer.utils.constant.MetaKeys attribute) handDetect() (in module data_juicer.ops.common.dwpose_func) hash (data_juicer.utils.constant.HashKeys attribute) hash() (data_juicer.utils.fingerprint_utils.Hasher class method) hash_bytes() (data_juicer.utils.fingerprint_utils.Hasher class method) hash_default() (data_juicer.utils.fingerprint_utils.Hasher class method) Hasher (class in data_juicer.utils.fingerprint_utils) HashKeys (class in data_juicer.utils.constant) height (data_juicer.utils.video_utils.VideoMetadata attribute) hexdigest() (data_juicer.utils.fingerprint_utils.Hasher method) HiddenPrints (class in data_juicer.utils.logger_utils) hook (data_juicer.utils.constant.JobRequiredKeys attribute) html_tables (data_juicer.utils.constant.MetaKeys attribute) I id (data_juicer.utils.video_utils.Clip attribute) IdGenerator (class in data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator) ifd_score (data_juicer.utils.constant.StatsKeysConstant attribute) image (data_juicer.utils.mm_utils.SpecialTokens attribute) image_aesthetics_scores (data_juicer.utils.constant.StatsKeysConstant attribute) image_byte_to_base64() (in module data_juicer.utils.mm_utils) image_height (data_juicer.utils.constant.StatsKeysConstant attribute) image_nsfw_score (data_juicer.utils.constant.StatsKeysConstant attribute) image_pair_similarity (data_juicer.utils.constant.StatsKeysConstant attribute) image_path_to_base64() (in module data_juicer.utils.mm_utils) image_sizes (data_juicer.utils.constant.StatsKeysConstant attribute) image_tags (data_juicer.utils.constant.MetaKeys attribute) image_text_matching_score (data_juicer.utils.constant.StatsKeysConstant attribute) image_text_similarity (data_juicer.utils.constant.StatsKeysConstant attribute) image_watermark_prob (data_juicer.utils.constant.StatsKeysConstant attribute) image_width (data_juicer.utils.constant.StatsKeysConstant attribute) ImageDeduplicator (class in data_juicer.ops.deduplicator) (class in data_juicer.ops.deduplicator.image_deduplicator) imagehash (data_juicer.utils.constant.HashKeys attribute) in_context_influence (data_juicer.utils.constant.StatsKeysConstant attribute) indices (data_juicer.utils.video_utils.Frames attribute) inference() (in module data_juicer.ops.common.dwpose_func) inference_detector() (in module data_juicer.ops.common.dwpose_func) inference_pose() (in module data_juicer.ops.common.dwpose_func) init_configs() (in module data_juicer.config) (in module data_juicer.config.config) init_setup_from_cfg() (in module data_juicer.config.config) init_spark() (in module data_juicer.tools.quality_classifier.qc_utils) initialize_ray() (in module data_juicer.utils.ray_utils) input (data_juicer.utils.constant.JobRequiredKeys attribute) insert_texts_after_placeholders() (in module data_juicer.utils.mm_utils) InterVars (class in data_juicer.utils.constant) iou() (in module data_juicer.utils.mm_utils) is_absolute_path() (in module data_juicer.utils.file_utils) is_available() (data_juicer.utils.video_utils.AVReader class method) (data_juicer.utils.video_utils.DecordReader class method) (data_juicer.utils.video_utils.FFmpegReader class method) (data_juicer.utils.video_utils.VideoReader class method) is_batched_op() (data_juicer.ops.base_op.OP method) is_cuda_available() (in module data_juicer.utils.resource_utils) is_float() (in module data_juicer.utils.common_utils) is_notebook() (in module data_juicer.utils.logger_utils) is_numeric_list_series() (in module data_juicer.analysis.correlation_analysis) is_ray_mode() (in module data_juicer.utils.ray_utils) is_remote_path() (in module data_juicer.utils.file_utils) is_string_list() (in module data_juicer.utils.common_utils) is_unique (data_juicer.utils.constant.HashKeys attribute) is_unique() (data_juicer.ops.deduplicator.ray_basic_deduplicator.ActorBackend method) (data_juicer.ops.deduplicator.ray_basic_deduplicator.Backend method) (data_juicer.ops.deduplicator.ray_basic_deduplicator.DedupSet method) (data_juicer.ops.deduplicator.ray_basic_deduplicator.RedisBackend method) isatty() (data_juicer.utils.logger_utils.StreamToLoguru method) iterate() (data_juicer.download.downloader.DocumentIterator method) (data_juicer.download.wikipedia.WikipediaIterator method) J JobRequiredKeys (class in data_juicer.utils.constant) JSDivMeasure (class in data_juicer.analysis.measure) JsonFormatter (class in data_juicer.format) (class in data_juicer.format.json_formatter) K keyword (data_juicer.utils.constant.MetaKeys attribute) KiB (data_juicer.utils.file_utils.Sizes attribute) KLDivMeasure (class in data_juicer.analysis.measure) L lang (data_juicer.utils.constant.StatsKeysConstant attribute) lang_score (data_juicer.utils.constant.StatsKeysConstant attribute) LazyLoader (class in data_juicer.utils.lazy_loader) lines (data_juicer.utils.constant.InterVars attribute) list() (data_juicer.utils.registry.Registry method) llm_analysis_record (data_juicer.utils.constant.StatsKeysConstant attribute) llm_analysis_score (data_juicer.utils.constant.StatsKeysConstant attribute) llm_difficulty_record (data_juicer.utils.constant.StatsKeysConstant attribute) llm_difficulty_score (data_juicer.utils.constant.StatsKeysConstant attribute) llm_perplexity (data_juicer.utils.constant.StatsKeysConstant attribute) llm_quality_record (data_juicer.utils.constant.StatsKeysConstant attribute) llm_quality_score (data_juicer.utils.constant.StatsKeysConstant attribute) llm_task_relevance (data_juicer.utils.constant.StatsKeysConstant attribute) llm_task_relevance_record (data_juicer.utils.constant.StatsKeysConstant attribute) load_audio() (in module data_juicer.utils.mm_utils) load_audios() (in module data_juicer.utils.mm_utils) load_ckpt() (data_juicer.utils.ckpt_utils.CheckpointManager method) load_custom_operators() (in module data_juicer.config.config) load_data_with_context() (in module data_juicer.utils.mm_utils) load_dataset() (data_juicer.format.empty_formatter.EmptyFormatter method) (data_juicer.format.empty_formatter.RayEmptyFormatter method) (data_juicer.format.EmptyFormatter method) (data_juicer.format.formatter.BaseFormatter method) (data_juicer.format.formatter.LocalFormatter method) (data_juicer.format.formatter.RemoteFormatter method) (data_juicer.format.LocalFormatter method) (data_juicer.format.RayEmptyFormatter method) (data_juicer.format.RemoteFormatter method) (data_juicer.format.text_formatter.TextFormatter method) (data_juicer.format.TextFormatter method) (in module data_juicer.tools.quality_classifier.qc_utils) load_datasets() (in module data_juicer.tools.quality_classifier.qc_utils) load_formatter() (in module data_juicer.format.load) load_image() (in module data_juicer.utils.mm_utils) load_image_byte() (in module data_juicer.utils.mm_utils) load_images() (in module data_juicer.utils.mm_utils) load_images_byte() (in module data_juicer.utils.mm_utils) load_mm_bytes_from_sample() (in module data_juicer.utils.mm_utils) load_ops() (in module data_juicer.ops.load) load_ops_with_stats_meta() (in module data_juicer.config.config) load_video() (in module data_juicer.utils.mm_utils) load_videos() (in module data_juicer.utils.mm_utils) load_words_asset() (in module data_juicer.utils.asset_utils) loaded_audios (data_juicer.utils.constant.InterVars attribute) loaded_images (data_juicer.utils.constant.InterVars attribute) loaded_videos (data_juicer.utils.constant.InterVars attribute) local (data_juicer.utils.constant.JobRequiredKeys attribute) LocalBlend (class in data_juicer.ops.common.prompt2prompt_pipeline) LocalFormatter (class in data_juicer.format) (class in data_juicer.format.formatter) Lz4Compressor (class in data_juicer.utils.compress) M main() (in module data_juicer.tools.mcp_server) (in module data_juicer.tools.quality_classifier.eval) (in module data_juicer.tools.quality_classifier.train) main_entities (data_juicer.utils.constant.MetaKeys attribute) make_log_summarization() (in module data_juicer.utils.logger_utils) Mapper (class in data_juicer.ops.base_op) max_line_length (data_juicer.utils.constant.StatsKeysConstant attribute) Measure (class in data_juicer.analysis.measure) measure() (data_juicer.analysis.measure.CrossEntropyMeasure method) (data_juicer.analysis.measure.EntropyMeasure method) (data_juicer.analysis.measure.JSDivMeasure method) (data_juicer.analysis.measure.KLDivMeasure method) (data_juicer.analysis.measure.Measure method) (data_juicer.analysis.measure.RelatedTTestMeasure method) merge() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.RayBTSMinhashDeduplicator method) (data_juicer.ops.deduplicator.RayBTSMinhashDeduplicator method) merge_config() (in module data_juicer.config) (in module data_juicer.config.config) merge_on_whitespace_tab_newline() (in module data_juicer.ops.common) (in module data_juicer.ops.common.helper_func) merge_op_batch() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.RayBTSMinhashDeduplicator method) (data_juicer.ops.deduplicator.RayBTSMinhashDeduplicator method) meta (data_juicer.utils.constant.Fields attribute) meta_map() (data_juicer.ops.aggregator.meta_tags_aggregator.MetaTagsAggregator method) (data_juicer.ops.aggregator.MetaTagsAggregator method) meta_name (data_juicer.utils.constant.JobRequiredKeys attribute) metadata (data_juicer.utils.video_utils.VideoReader property) MetaKeys (class in data_juicer.utils.constant) MetaTagsAggregator (class in data_juicer.ops.aggregator) (class in data_juicer.ops.aggregator.meta_tags_aggregator) MiB (data_juicer.utils.file_utils.Sizes attribute) minhash (data_juicer.utils.constant.HashKeys attribute) mis_match_char() (data_juicer.ops.common.prompt2prompt_pipeline.ScoreParams method) module data_juicer data_juicer.analysis data_juicer.analysis.column_wise_analysis data_juicer.analysis.correlation_analysis data_juicer.analysis.diversity_analysis data_juicer.analysis.measure data_juicer.analysis.overall_analysis data_juicer.config data_juicer.config.config data_juicer.core.monitor data_juicer.download data_juicer.download.commoncrawl data_juicer.download.downloader data_juicer.download.wikipedia data_juicer.format data_juicer.format.csv_formatter data_juicer.format.empty_formatter data_juicer.format.formatter data_juicer.format.json_formatter data_juicer.format.load data_juicer.format.parquet_formatter data_juicer.format.text_formatter data_juicer.format.tsv_formatter data_juicer.ops.aggregator data_juicer.ops.aggregator.entity_attribute_aggregator data_juicer.ops.aggregator.meta_tags_aggregator data_juicer.ops.aggregator.most_relevant_entities_aggregator data_juicer.ops.aggregator.nested_aggregator data_juicer.ops.base_op data_juicer.ops.common data_juicer.ops.common.dwpose_func data_juicer.ops.common.helper_func data_juicer.ops.common.prompt2prompt_pipeline data_juicer.ops.common.special_characters data_juicer.ops.deduplicator data_juicer.ops.deduplicator.document_deduplicator data_juicer.ops.deduplicator.document_minhash_deduplicator data_juicer.ops.deduplicator.document_simhash_deduplicator data_juicer.ops.deduplicator.image_deduplicator data_juicer.ops.deduplicator.ray_basic_deduplicator data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator data_juicer.ops.deduplicator.ray_document_deduplicator data_juicer.ops.deduplicator.ray_image_deduplicator data_juicer.ops.deduplicator.ray_video_deduplicator data_juicer.ops.deduplicator.video_deduplicator data_juicer.ops.filter.alphanumeric_filter data_juicer.ops.filter.audio_duration_filter data_juicer.ops.load data_juicer.ops.op_fusion data_juicer.tools data_juicer.tools.hpo data_juicer.tools.mcp_server data_juicer.tools.quality_classifier data_juicer.tools.quality_classifier.eval data_juicer.tools.quality_classifier.predict data_juicer.tools.quality_classifier.qc_utils data_juicer.tools.quality_classifier.train data_juicer.utils data_juicer.utils.asset_utils data_juicer.utils.availability_utils data_juicer.utils.cache_utils data_juicer.utils.ckpt_utils data_juicer.utils.common_utils data_juicer.utils.compress data_juicer.utils.constant data_juicer.utils.file_utils data_juicer.utils.fingerprint_utils data_juicer.utils.lazy_loader data_juicer.utils.logger_utils data_juicer.utils.mm_utils data_juicer.utils.model_utils data_juicer.utils.nltk_utils data_juicer.utils.process_utils data_juicer.utils.ray_utils data_juicer.utils.registry data_juicer.utils.resource_utils data_juicer.utils.s3_utils data_juicer.utils.sample data_juicer.utils.video_utils data_juicer.utils.webdataset_utils modules (data_juicer.utils.registry.Registry property) Monitor (class in data_juicer.core.monitor) monitor_current_resources() (data_juicer.core.monitor.Monitor static method) monitor_func() (data_juicer.core.monitor.Monitor static method) most_relevant_entities (data_juicer.utils.constant.BatchMetaKeys attribute) MostRelevantEntitiesAggregator (class in data_juicer.ops.aggregator) (class in data_juicer.ops.aggregator.most_relevant_entities_aggregator) multiclass_nms() (in module data_juicer.ops.common.dwpose_func) multimodal_data_output_dir (data_juicer.utils.constant.Fields attribute) N name (data_juicer.analysis.measure.CrossEntropyMeasure attribute) (data_juicer.analysis.measure.EntropyMeasure attribute) (data_juicer.analysis.measure.JSDivMeasure attribute) (data_juicer.analysis.measure.KLDivMeasure attribute) (data_juicer.analysis.measure.Measure attribute) (data_juicer.analysis.measure.RelatedTTestMeasure attribute) (data_juicer.utils.registry.Registry property) namespace_to_arg_list() (in module data_juicer.config.config) nested_access() (in module data_juicer.utils.common_utils) NestedAggregator (class in data_juicer.ops.aggregator) (class in data_juicer.ops.aggregator.nested_aggregator) nickname (data_juicer.utils.constant.MetaKeys attribute) nms() (in module data_juicer.ops.common.dwpose_func) npmax() (in module data_juicer.ops.common.dwpose_func) null_value (data_juicer.format.empty_formatter.EmptyFormatter property) (data_juicer.format.empty_formatter.RayEmptyFormatter property) (data_juicer.format.EmptyFormatter property) (data_juicer.format.RayEmptyFormatter property) num_action (data_juicer.utils.constant.StatsKeysConstant attribute) num_dependency_edges (data_juicer.utils.constant.StatsKeysConstant attribute) num_frames (data_juicer.utils.video_utils.VideoMetadata attribute) num_token (data_juicer.utils.constant.StatsKeysConstant attribute) num_uncond_att_layers (data_juicer.ops.common.prompt2prompt_pipeline.AttentionControl property) num_words (data_juicer.utils.constant.StatsKeysConstant attribute) O OP (class in data_juicer.ops.base_op) optimal_param() (in module data_juicer.ops.deduplicator.document_minhash_deduplicator) output (data_juicer.utils.constant.JobRequiredKeys attribute) OverallAnalysis (class in data_juicer.analysis) (class in data_juicer.analysis.overall_analysis) P P2PCrossAttnProcessor (class in data_juicer.ops.common.prompt2prompt_pipeline) padRightDownCorner() (in module data_juicer.ops.common.dwpose_func) ParquetFormatter (class in data_juicer.format) (class in data_juicer.format.parquet_formatter) parse_output() (data_juicer.ops.aggregator.entity_attribute_aggregator.EntityAttributeAggregator method) (data_juicer.ops.aggregator.EntityAttributeAggregator method) (data_juicer.ops.aggregator.meta_tags_aggregator.MetaTagsAggregator method) (data_juicer.ops.aggregator.MetaTagsAggregator method) (data_juicer.ops.aggregator.most_relevant_entities_aggregator.MostRelevantEntitiesAggregator method) (data_juicer.ops.aggregator.MostRelevantEntitiesAggregator method) (data_juicer.ops.aggregator.nested_aggregator.NestedAggregator method) (data_juicer.ops.aggregator.NestedAggregator method) parse_string_to_roi() (in module data_juicer.utils.mm_utils) patch_nltk_pickle_security() (in module data_juicer.utils.nltk_utils) path (data_juicer.utils.video_utils.Clip attribute) perplexity (data_juicer.utils.constant.StatsKeysConstant attribute) phrase_grounding_recall (data_juicer.utils.constant.StatsKeysConstant attribute) pil_to_opencv() (in module data_juicer.utils.mm_utils) pose_estimation_tags (data_juicer.utils.constant.MetaKeys attribute) postprocess() (in module data_juicer.ops.common.dwpose_func) predict() (in module data_juicer.tools.quality_classifier.qc_utils) predict_score() (in module data_juicer.tools.quality_classifier.predict) prepare_api_model() (in module data_juicer.utils.model_utils) prepare_cfgs_for_export() (in module data_juicer.config) (in module data_juicer.config.config) prepare_diffusion_model() (in module data_juicer.utils.model_utils) prepare_dwpose_model() (in module data_juicer.utils.model_utils) prepare_embedding_model() (in module data_juicer.utils.model_utils) prepare_fastsam_model() (in module data_juicer.utils.model_utils) prepare_fasttext_model() (in module data_juicer.utils.model_utils) prepare_huggingface_model() (in module data_juicer.utils.model_utils) prepare_kenlm_model() (in module data_juicer.utils.model_utils) prepare_model() (in module data_juicer.tools.quality_classifier.qc_utils) (in module data_juicer.utils.model_utils) prepare_nltk_model() (in module data_juicer.utils.model_utils) prepare_nltk_pos_tagger() (in module data_juicer.utils.model_utils) prepare_opencv_classifier() (in module data_juicer.utils.model_utils) prepare_qwen_vl_inputs_for_vllm() (in module data_juicer.utils.model_utils) prepare_recognizeAnything_model() (in module data_juicer.utils.model_utils) prepare_sdxl_prompt2prompt() (in module data_juicer.utils.model_utils) prepare_sentencepiece_for_lang() (in module data_juicer.utils.model_utils) prepare_sentencepiece_model() (in module data_juicer.utils.model_utils) prepare_side_configs() (in module data_juicer.config) (in module data_juicer.config.config) prepare_simple_aesthetics_model() (in module data_juicer.utils.model_utils) prepare_spacy_model() (in module data_juicer.utils.model_utils) prepare_vggt_model() (in module data_juicer.utils.model_utils) prepare_video_blip_model() (in module data_juicer.utils.model_utils) prepare_video_depth_anything() (in module data_juicer.utils.model_utils) prepare_vllm_model() (in module data_juicer.utils.model_utils) prepare_wilor_model() (in module data_juicer.utils.model_utils) prepare_yolo_model() (in module data_juicer.utils.model_utils) preprocess_det() (in module data_juicer.ops.common.dwpose_func) preprocess_pose() (in module data_juicer.ops.common.dwpose_func) process() (data_juicer.ops.base_op.Deduplicator method) (data_juicer.ops.base_op.Grouper method) (data_juicer.ops.base_op.OP method) (data_juicer.ops.base_op.Selector method) (data_juicer.ops.deduplicator.document_deduplicator.DocumentDeduplicator method) (data_juicer.ops.deduplicator.document_minhash_deduplicator.DocumentMinhashDeduplicator method) (data_juicer.ops.deduplicator.document_minhash_deduplicator.DocumentMinhashDeduplicatorWithUid method) (data_juicer.ops.deduplicator.document_simhash_deduplicator.DocumentSimhashDeduplicator method) (data_juicer.ops.deduplicator.DocumentDeduplicator method) (data_juicer.ops.deduplicator.DocumentMinhashDeduplicator method) (data_juicer.ops.deduplicator.DocumentMinhashDeduplicatorWithUid method) (data_juicer.ops.deduplicator.DocumentSimhashDeduplicator method) (data_juicer.ops.deduplicator.image_deduplicator.ImageDeduplicator method) (data_juicer.ops.deduplicator.ImageDeduplicator method) (data_juicer.ops.deduplicator.video_deduplicator.VideoDeduplicator method) (data_juicer.ops.deduplicator.VideoDeduplicator method) process_batched() (data_juicer.ops.base_op.Filter method) (data_juicer.ops.base_op.Mapper method) (data_juicer.ops.filter.alphanumeric_filter.AlphanumericFilter method) (data_juicer.ops.op_fusion.FusedFilter method) (data_juicer.ops.op_fusion.GeneralFusedOP method) process_each_frame() (in module data_juicer.utils.mm_utils) process_single() (data_juicer.ops.aggregator.entity_attribute_aggregator.EntityAttributeAggregator method) (data_juicer.ops.aggregator.EntityAttributeAggregator method) (data_juicer.ops.aggregator.meta_tags_aggregator.MetaTagsAggregator method) (data_juicer.ops.aggregator.MetaTagsAggregator method) (data_juicer.ops.aggregator.most_relevant_entities_aggregator.MostRelevantEntitiesAggregator method) (data_juicer.ops.aggregator.MostRelevantEntitiesAggregator method) (data_juicer.ops.aggregator.nested_aggregator.NestedAggregator method) (data_juicer.ops.aggregator.NestedAggregator method) (data_juicer.ops.base_op.Aggregator method) (data_juicer.ops.base_op.Filter method) (data_juicer.ops.base_op.Mapper method) (data_juicer.ops.deduplicator.ray_basic_deduplicator.RayBasicDeduplicator method) (data_juicer.ops.deduplicator.RayBasicDeduplicator method) (data_juicer.ops.filter.audio_duration_filter.AudioDurationFilter method) Prompt2PromptPipeline (class in data_juicer.ops.common.prompt2prompt_pipeline) pts_time (data_juicer.utils.video_utils.Frames attribute) Q query_cuda_info() (in module data_juicer.utils.resource_utils) query_intent_label (data_juicer.utils.constant.MetaKeys attribute) query_intent_score (data_juicer.utils.constant.MetaKeys attribute) query_mem_info() (in module data_juicer.utils.resource_utils) query_most_relevant_entities() (data_juicer.ops.aggregator.most_relevant_entities_aggregator.MostRelevantEntitiesAggregator method) (data_juicer.ops.aggregator.MostRelevantEntitiesAggregator method) query_sentiment_label (data_juicer.utils.constant.MetaKeys attribute) query_sentiment_score (data_juicer.utils.constant.MetaKeys attribute) query_topic_label (data_juicer.utils.constant.MetaKeys attribute) query_topic_score (data_juicer.utils.constant.MetaKeys attribute) R random_sample() (in module data_juicer.utils.sample) ray_available_gpu_memories() (in module data_juicer.utils.ray_utils) ray_available_memories() (in module data_juicer.utils.ray_utils) ray_cpu_count() (in module data_juicer.utils.ray_utils) ray_gpu_count() (in module data_juicer.utils.ray_utils) ray_gpu_memories() (in module data_juicer.utils.ray_utils) RayBasicDeduplicator (class in data_juicer.ops.deduplicator) (class in data_juicer.ops.deduplicator.ray_basic_deduplicator) RayBTSMinhashDeduplicator (class in data_juicer.ops.deduplicator) (class in data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator) RayBTSMinhashDeduplicatorWithUid (class in data_juicer.ops.deduplicator) (class in data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator) RayDocumentDeduplicator (class in data_juicer.ops.deduplicator) (class in data_juicer.ops.deduplicator.ray_document_deduplicator) RayEmptyFormatter (class in data_juicer.format) (class in data_juicer.format.empty_formatter) RayImageDeduplicator (class in data_juicer.ops.deduplicator) (class in data_juicer.ops.deduplicator.ray_image_deduplicator) RayVideoDeduplicator (class in data_juicer.ops.deduplicator) (class in data_juicer.ops.deduplicator.ray_video_deduplicator) read_file_as_bytes() (in module data_juicer.utils.webdataset_utils) read_single_partition() (in module data_juicer.utils.file_utils) rebalancing() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.BTSUnionFind method) reconstruct_custom_webdataset_format() (in module data_juicer.utils.webdataset_utils) record() (data_juicer.utils.ckpt_utils.CheckpointManager method) recursive_summary() (data_juicer.ops.aggregator.nested_aggregator.NestedAggregator method) (data_juicer.ops.aggregator.NestedAggregator method) redirect_sys_output() (in module data_juicer.utils.logger_utils) RedisBackend (class in data_juicer.ops.deduplicator.ray_basic_deduplicator) refine_single_column() (data_juicer.analysis.overall_analysis.OverallAnalysis method) (data_juicer.analysis.OverallAnalysis method) refined_words (data_juicer.utils.constant.InterVars attribute) register_attention_control() (data_juicer.ops.common.prompt2prompt_pipeline.Prompt2PromptPipeline method) register_module() (data_juicer.utils.registry.Registry method) Registry (class in data_juicer.utils.registry) RelatedTTestMeasure (class in data_juicer.analysis.measure) relation (data_juicer.utils.constant.MetaKeys attribute) relation_description (data_juicer.utils.constant.MetaKeys attribute) relation_keywords (data_juicer.utils.constant.MetaKeys attribute) relation_strength (data_juicer.utils.constant.MetaKeys attribute) relevant_characters (data_juicer.utils.constant.MetaKeys attribute) RemoteFormatter (class in data_juicer.format) (class in data_juicer.format.formatter) remove_extra_parameters() (data_juicer.ops.base_op.OP method) remove_non_special_tokens() (in module data_juicer.utils.mm_utils) remove_special_tokens() (in module data_juicer.utils.mm_utils) replace_cross_attention() (data_juicer.ops.common.prompt2prompt_pipeline.AttentionControlEdit method) (data_juicer.ops.common.prompt2prompt_pipeline.AttentionRefine method) (data_juicer.ops.common.prompt2prompt_pipeline.AttentionReplace method) (data_juicer.ops.common.prompt2prompt_pipeline.AttentionReweight method) replace_self_attention() (data_juicer.ops.common.prompt2prompt_pipeline.AttentionControlEdit method) rescale_noise_cfg() (in module data_juicer.ops.common.prompt2prompt_pipeline) reset() (data_juicer.ops.common.prompt2prompt_pipeline.AttentionControl method) (data_juicer.ops.common.prompt2prompt_pipeline.AttentionStore method) reset_dependencies_cache() (data_juicer.utils.lazy_loader.LazyLoader class method) resource_monitor() (in module data_juicer.core.monitor) role_relation (data_juicer.utils.constant.MetaKeys attribute) run() (data_juicer.ops.base_op.Aggregator method) (data_juicer.ops.base_op.Deduplicator method) (data_juicer.ops.base_op.Filter method) (data_juicer.ops.base_op.Grouper method) (data_juicer.ops.base_op.Mapper method) (data_juicer.ops.base_op.OP method) (data_juicer.ops.base_op.Selector method) (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.RayBTSMinhashDeduplicator method) (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.RayBTSMinhashDeduplicatorWithUid method) (data_juicer.ops.deduplicator.RayBTSMinhashDeduplicator method) (data_juicer.ops.deduplicator.RayBTSMinhashDeduplicatorWithUid method) (data_juicer.ops.op_fusion.GeneralFusedOP method) runtime_np() (data_juicer.ops.base_op.OP method) S sampled_frames (data_juicer.utils.constant.InterVars attribute) save_ckpt() (data_juicer.utils.ckpt_utils.CheckpointManager method) ScoreParams (class in data_juicer.ops.common.prompt2prompt_pipeline) Selector (class in data_juicer.ops.base_op) set_edge_buffer() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.BTSUnionFind method) set_edges() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.EdgeBuffer method) setup_logger() (in module data_juicer.utils.logger_utils) setup_mp() (in module data_juicer.utils.process_utils) setup_resource_aliases() (in module data_juicer.utils.nltk_utils) sha1_hash32() (in module data_juicer.ops.deduplicator.document_minhash_deduplicator) shuffle() (in module data_juicer.tools.quality_classifier.qc_utils) simhash (data_juicer.utils.constant.HashKeys attribute) single_partition_write_with_filename() (in module data_juicer.utils.file_utils) size_to_bytes() (in module data_juicer.utils.mm_utils) Sizes (class in data_juicer.utils.file_utils) smart_resize() (in module data_juicer.ops.common.dwpose_func) smart_resize_k() (in module data_juicer.ops.common.dwpose_func) sort_op_by_types_and_names() (in module data_juicer.config.config) source_entity (data_juicer.utils.constant.MetaKeys attribute) source_file (data_juicer.utils.constant.Fields attribute) source_video (data_juicer.utils.video_utils.Clip attribute) span (data_juicer.utils.video_utils.Clip attribute) special_char_ratio (data_juicer.utils.constant.StatsKeysConstant attribute) SpecialTokens (class in data_juicer.utils.mm_utils) split_on_newline_tab_whitespace() (in module data_juicer.ops.common) (in module data_juicer.ops.common.helper_func) split_on_whitespace() (in module data_juicer.ops.common) (in module data_juicer.ops.common.helper_func) split_text_by_punctuation() (in module data_juicer.ops.common) (in module data_juicer.ops.common.helper_func) squeeze() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.BTSUnionFind method) stats (data_juicer.utils.constant.Fields attribute) stats_to_hist() (data_juicer.analysis.measure.RelatedTTestMeasure static method) stats_to_number() (in module data_juicer.utils.common_utils) StatsKeys (class in data_juicer.utils.constant) StatsKeysConstant (class in data_juicer.utils.constant) StatsKeysMeta (class in data_juicer.utils.constant) step_callback() (data_juicer.ops.common.prompt2prompt_pipeline.AttentionControl method) (data_juicer.ops.common.prompt2prompt_pipeline.AttentionControlEdit method) stopwords_ratio (data_juicer.utils.constant.StatsKeysConstant attribute) StreamToLoguru (class in data_juicer.utils.logger_utils) strip() (in module data_juicer.ops.common) (in module data_juicer.ops.common.helper_func) suffix (data_juicer.utils.constant.Fields attribute) SUFFIXES (data_juicer.format.csv_formatter.CsvFormatter attribute) (data_juicer.format.CsvFormatter attribute) (data_juicer.format.empty_formatter.EmptyFormatter attribute) (data_juicer.format.empty_formatter.RayEmptyFormatter attribute) (data_juicer.format.EmptyFormatter attribute) (data_juicer.format.json_formatter.JsonFormatter attribute) (data_juicer.format.JsonFormatter attribute) (data_juicer.format.parquet_formatter.ParquetFormatter attribute) (data_juicer.format.ParquetFormatter attribute) (data_juicer.format.RayEmptyFormatter attribute) (data_juicer.format.text_formatter.TextFormatter attribute) (data_juicer.format.TextFormatter attribute) (data_juicer.format.tsv_formatter.TsvFormatter attribute) (data_juicer.format.TsvFormatter attribute) support_text (data_juicer.utils.constant.MetaKeys attribute) T target_entity (data_juicer.utils.constant.MetaKeys attribute) text_embd_similarity (data_juicer.utils.constant.StatsKeysConstant attribute) text_len (data_juicer.utils.constant.StatsKeysConstant attribute) text_pair_similarity (data_juicer.utils.constant.StatsKeysConstant attribute) text_tags (data_juicer.utils.constant.Fields attribute) TextFormatter (class in data_juicer.format) (class in data_juicer.format.text_formatter) TiB (data_juicer.utils.file_utils.Sizes attribute) timecode_string_to_seconds() (in module data_juicer.utils.mm_utils) timing_context() (in module data_juicer.config.config) tokenize_dataset() (in module data_juicer.tools.quality_classifier.qc_utils) top_down_affine() (in module data_juicer.ops.common.dwpose_func) train() (in module data_juicer.tools.quality_classifier.qc_utils) transfer() (in module data_juicer.ops.common.dwpose_func) transfer_data_dir() (in module data_juicer.utils.file_utils) transfer_filename() (in module data_juicer.utils.file_utils) TsvFormatter (class in data_juicer.format) (class in data_juicer.format.tsv_formatter) U uid (data_juicer.utils.constant.HashKeys attribute) unify_format() (in module data_juicer.format.formatter) union() (data_juicer.ops.common.helper_func.UnionFind method) (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.BTSUnionFind method) union_list() (data_juicer.ops.deduplicator.ray_bts_minhash_deduplicator.BTSUnionFind method) UnionFind (class in data_juicer.ops.common.helper_func) update() (data_juicer.utils.fingerprint_utils.Hasher method) update_alpha_time_word() (in module data_juicer.ops.common.prompt2prompt_pipeline) update_ds_cache_dir_and_related_vars() (in module data_juicer.config.config) update_fingerprint() (in module data_juicer.utils.fingerprint_utils) update_op_attr() (in module data_juicer.config) (in module data_juicer.config.config) update_op_process() (in module data_juicer.config.config) update_sampling_params() (in module data_juicer.utils.model_utils) use_auto_proc() (data_juicer.ops.base_op.OP method) use_cuda() (data_juicer.ops.base_op.OP method) use_ray_actor() (data_juicer.ops.base_op.OP method) V validate_s3_path() (in module data_juicer.utils.s3_utils) validate_snapshot_format() (in module data_juicer.download.downloader) vggt_tags (data_juicer.utils.constant.MetaKeys attribute) video (data_juicer.utils.mm_utils.SpecialTokens attribute) video_aesthetic_score (data_juicer.utils.constant.StatsKeysConstant attribute) video_aspect_ratios (data_juicer.utils.constant.StatsKeysConstant attribute) video_audio_tags (data_juicer.utils.constant.MetaKeys attribute) video_depth_tags (data_juicer.utils.constant.MetaKeys attribute) video_duration (data_juicer.utils.constant.StatsKeysConstant attribute) video_frame_tags (data_juicer.utils.constant.MetaKeys attribute) video_frames (data_juicer.utils.constant.MetaKeys attribute) video_frames_aesthetics_score (data_juicer.utils.constant.StatsKeysConstant attribute) video_frames_text_similarity (data_juicer.utils.constant.StatsKeysConstant attribute) video_height (data_juicer.utils.constant.StatsKeysConstant attribute) video_motion_score (data_juicer.utils.constant.StatsKeysConstant attribute) video_nsfw_score (data_juicer.utils.constant.StatsKeysConstant attribute) video_object_segment_tags (data_juicer.utils.constant.MetaKeys attribute) video_ocr_area_ratio (data_juicer.utils.constant.StatsKeysConstant attribute) video_watermark_prob (data_juicer.utils.constant.StatsKeysConstant attribute) video_width (data_juicer.utils.constant.StatsKeysConstant attribute) VideoDeduplicator (class in data_juicer.ops.deduplicator) (class in data_juicer.ops.deduplicator.video_deduplicator) videohash (data_juicer.utils.constant.HashKeys attribute) VideoMetadata (class in data_juicer.utils.video_utils) VideoReader (class in data_juicer.utils.video_utils) W Wholebody (class in data_juicer.ops.common.dwpose_func) width (data_juicer.utils.video_utils.VideoMetadata attribute) WikipediaDownloader (class in data_juicer.download.wikipedia) WikipediaExtractor (class in data_juicer.download.wikipedia) WikipediaIterator (class in data_juicer.download.wikipedia) word_rep_ratio (data_juicer.utils.constant.StatsKeysConstant attribute) words (data_juicer.utils.constant.InterVars attribute) words_augmentation() (in module data_juicer.ops.common) (in module data_juicer.ops.common.helper_func) words_refinement() (in module data_juicer.ops.common) (in module data_juicer.ops.common.helper_func) write() (data_juicer.utils.logger_utils.StreamToLoguru method) Z ZstdCompressor (class in data_juicer.utils.compress)