data_juicer.utils.constant module#
- class data_juicer.utils.constant.Fields[源代码]#
基类: object
- stats = '__dj__stats__'#
- meta = '__dj__meta__'#
- batch_meta = '__dj__batch_meta__'#
- context = '__dj__context__'#
- suffix = '__dj__suffix__'#
- source_file = '__dj__source_file__'#
- multimodal_data_output_dir = '__dj__produced_data__'#
- class data_juicer.utils.constant.BatchMetaKeys[源代码]#
基类: object
- entity_attribute = 'entity_attribute'#
- most_relevant_entities = 'most_relevant_entities'#
- class data_juicer.utils.constant.MetaKeys[源代码]#
基类: object
- dialog_sentiment_intensity = 'dialog_sentiment_intensity'#
- dialog_sentiment_intensity_analysis = 'dialog_sentiment_intensity_analysis'#
- query_sentiment_label = 'query_sentiment_label'#
- query_sentiment_score = 'query_sentiment_label_score'#
- dialog_sentiment_labels = 'dialog_sentiment_labels'#
- dialog_sentiment_labels_analysis = 'dialog_sentiment_labels_analysis'#
- dialog_intent_labels = 'dialog_intent_labels'#
- dialog_intent_labels_analysis = 'dialog_intent_labels_analysis'#
- query_intent_label = 'query_intent_label'#
- query_intent_score = 'query_intent_label_score'#
- dialog_topic_labels = 'dialog_topic_labels'#
- dialog_topic_labels_analysis = 'dialog_topic_labels_analysis'#
- query_topic_label = 'query_topic_label'#
- query_topic_score = 'query_topic_label_score'#
- video_frame_tags = 'video_frame_tags'#
- video_audio_tags = 'video_audio_tags'#
- video_frames = 'video_frames'#
- image_tags = 'image_tags'#
- bbox_tag = '__dj__bbox__'#
- event_description = 'event_description'#
- relevant_characters = 'relevant_characters'#
- main_entities = 'main_entities'#
- attributes = 'attributes'#
- attribute_descriptions = 'attribute_descriptions'#
- attribute_support_texts = 'attribute_support_texts'#
- nickname = 'nickname'#
- entity = 'entity'#
- entity_name = 'entity_name'#
- entity_type = 'entity_type'#
- entity_description = 'entity_entity_description'#
- relation = 'relation'#
- source_entity = 'relation_source_entity'#
- target_entity = 'relation_target_entity'#
- relation_description = 'relation_description'#
- relation_keywords = 'relation_keywords'#
- relation_strength = 'relation_strength'#
- keyword = 'keyword'#
- support_text = 'support_text'#
- role_relation = 'role_relation'#
- html_tables = 'html_tables'#
- class data_juicer.utils.constant.StatsKeysMeta[源代码]#
基类: type
A helper class to track the mapping from an OP's name to the stats_keys it uses.
e.g.,
    # once the AlphanumericFilter's compute_stats method has been called
    res = TrackingDescriptor.get_access_log()
    print(res)  # {"AlphanumericFilter": ["alnum_ratio", "alpha_token_ratio"]}
- class data_juicer.utils.constant.StatsKeysConstant[源代码]#
基类: object
- alpha_token_ratio = 'alpha_token_ratio'#
- alnum_ratio = 'alnum_ratio'#
- avg_line_length = 'avg_line_length'#
- char_rep_ratio = 'char_rep_ratio'#
- flagged_words_ratio = 'flagged_words_ratio'#
- lang = 'lang'#
- lang_score = 'lang_score'#
- max_line_length = 'max_line_length'#
- perplexity = 'perplexity'#
- special_char_ratio = 'special_char_ratio'#
- stopwords_ratio = 'stopwords_ratio'#
- text_len = 'text_len'#
- text_pair_similarity = 'text_pair_similarity'#
- num_action = 'num_action'#
- num_dependency_edges = 'num_dependency_edges'#
- num_token = 'num_token'#
- num_words = 'num_words'#
- word_rep_ratio = 'word_rep_ratio'#
- llm_analysis_score = 'llm_analysis_score'#
- llm_analysis_record = 'llm_analysis_record'#
- llm_quality_score = 'llm_quality_score'#
- llm_quality_record = 'llm_quality_record'#
- llm_difficulty_score = 'llm_difficulty_score'#
- llm_difficulty_record = 'llm_difficulty_record'#
- aspect_ratios = 'aspect_ratios'#
- image_width = 'image_width'#
- image_height = 'image_height'#
- image_sizes = 'image_sizes'#
- face_ratios = 'face_ratios'#
- face_detections = 'face_detections'#
- face_counts = 'face_counts'#
- image_aesthetics_scores = 'image_aesthetics_scores'#
- image_nsfw_score = 'image_nsfw_score'#
- image_watermark_prob = 'image_watermark_prob'#
- image_pair_similarity = 'image_pair_similarity'#
- audio_duration = 'audio_duration'#
- audio_nmf_snr = 'audio_nmf_snr'#
- audio_sizes = 'audio_sizes'#
- video_duration = 'video_duration'#
- video_aspect_ratios = 'video_aspect_ratios'#
- video_width = 'video_width'#
- video_height = 'video_height'#
- video_ocr_area_ratio = 'video_ocr_area_ratio'#
- video_aesthetic_score = 'video_aesthetic_score'#
- video_frames_aesthetics_score = 'video_frames_aesthetics_score'#
- video_motion_score = 'video_motion_score'#
- video_nsfw_score = 'video_nsfw_score'#
- video_watermark_prob = 'video_watermark_prob'#
- image_text_similarity = 'image_text_similarity'#
- image_text_matching_score = 'image_text_matching_score'#
- phrase_grounding_recall = 'phrase_grounding_recall'#
- video_frames_text_similarity = 'video_frames_text_similarity'#
- general_field_filter_condition = 'general_field_filter_condition'#
- class data_juicer.utils.constant.HashKeys[源代码]#
基类: object
- uid = '__dj__uid'#
- hash = '__dj__hash'#
- minhash = '__dj__minhash'#
- simhash = '__dj__simhash'#
- imagehash = '__dj__imagehash'#
- videohash = '__dj__videohash'#
- is_unique = '__dj__is_unique'#
- class data_juicer.utils.constant.InterVars[源代码]#
基类: object
- lines = '__dj__lines'#
- words = '__dj__words'#
- refined_words = '__dj__refined_words'#
- loaded_images = '__dj__loaded_images'#
- loaded_audios = '__dj__loaded_audios'#
- loaded_videos = '__dj__loaded_videos'#
- sampled_frames = '__dj__sampled_frames'#