data_juicer.utils.constant module#

class data_juicer.utils.constant.Fields[source]#

Bases: object

stats = '__dj__stats__'#
meta = '__dj__meta__'#
batch_meta = '__dj__batch_meta__'#
context = '__dj__context__'#
suffix = '__dj__suffix__'#
source_file = '__dj__source_file__'#
multimodal_data_output_dir = '__dj__produced_data__'#
class data_juicer.utils.constant.BatchMetaKeys[source]#

Bases: object

entity_attribute = 'entity_attribute'#
most_relevant_entities = 'most_relevant_entities'#
class data_juicer.utils.constant.MetaKeys[source]#

Bases: object

dialog_sentiment_intensity = 'dialog_sentiment_intensity'#
dialog_sentiment_intensity_analysis = 'dialog_sentiment_intensity_analysis'#
query_sentiment_label = 'query_sentiment_label'#
query_sentiment_score = 'query_sentiment_label_score'#
dialog_sentiment_labels = 'dialog_sentiment_labels'#
dialog_sentiment_labels_analysis = 'dialog_sentiment_labels_analysis'#
dialog_intent_labels = 'dialog_intent_labels'#
dialog_intent_labels_analysis = 'dialog_intent_labels_analysis'#
query_intent_label = 'query_intent_label'#
query_intent_score = 'query_intent_label_score'#
dialog_topic_labels = 'dialog_topic_labels'#
dialog_topic_labels_analysis = 'dialog_topic_labels_analysis'#
query_topic_label = 'query_topic_label'#
query_topic_score = 'query_topic_label_score'#
video_frame_tags = 'video_frame_tags'#
video_audio_tags = 'video_audio_tags'#
video_frames = 'video_frames'#
image_tags = 'image_tags'#
bbox_tag = '__dj__bbox__'#
event_description = 'event_description'#
relevant_characters = 'relevant_characters'#
main_entities = 'main_entities'#
attributes = 'attributes'#
attribute_descriptions = 'attribute_descriptions'#
attribute_support_texts = 'attribute_support_texts'#
nickname = 'nickname'#
entity = 'entity'#
entity_name = 'entity_name'#
entity_type = 'entity_type'#
entity_description = 'entity_entity_description'#
relation = 'relation'#
source_entity = 'relation_source_entity'#
target_entity = 'relation_target_entity'#
relation_description = 'relation_description'#
relation_keywords = 'relation_keywords'#
relation_strength = 'relation_strength'#
keyword = 'keyword'#
support_text = 'support_text'#
role_relation = 'role_relation'#
html_tables = 'html_tables'#
class data_juicer.utils.constant.StatsKeysMeta[source]#

Bases: type

A helper metaclass that tracks the mapping from each OP’s name to the stats keys it uses.

Example (once the AlphanumericFilter’s compute_stats method has been called):

    res = TrackingDescriptor.get_access_log()
    print(res)  # {"AlphanumericFilter": ["alnum_ratio", "alpha_token_ratio"]}

get_access_log(dj_cfg=None, dataset=None)[source]#
class data_juicer.utils.constant.StatsKeysConstant[source]#

Bases: object

alpha_token_ratio = 'alpha_token_ratio'#
alnum_ratio = 'alnum_ratio'#
avg_line_length = 'avg_line_length'#
char_rep_ratio = 'char_rep_ratio'#
flagged_words_ratio = 'flagged_words_ratio'#
lang = 'lang'#
lang_score = 'lang_score'#
max_line_length = 'max_line_length'#
perplexity = 'perplexity'#
special_char_ratio = 'special_char_ratio'#
stopwords_ratio = 'stopwords_ratio'#
text_len = 'text_len'#
text_pair_similarity = 'text_pair_similarity'#
num_action = 'num_action'#
num_dependency_edges = 'num_dependency_edges'#
num_token = 'num_token'#
num_words = 'num_words'#
word_rep_ratio = 'word_rep_ratio'#
llm_analysis_score = 'llm_analysis_score'#
llm_analysis_record = 'llm_analysis_record'#
llm_quality_score = 'llm_quality_score'#
llm_quality_record = 'llm_quality_record'#
llm_difficulty_score = 'llm_difficulty_score'#
llm_difficulty_record = 'llm_difficulty_record'#
aspect_ratios = 'aspect_ratios'#
image_width = 'image_width'#
image_height = 'image_height'#
image_sizes = 'image_sizes'#
face_ratios = 'face_ratios'#
face_detections = 'face_detections'#
face_counts = 'face_counts'#
image_aesthetics_scores = 'image_aesthetics_scores'#
image_nsfw_score = 'image_nsfw_score'#
image_watermark_prob = 'image_watermark_prob'#
image_pair_similarity = 'image_pair_similarity'#
audio_duration = 'audio_duration'#
audio_nmf_snr = 'audio_nmf_snr'#
audio_sizes = 'audio_sizes'#
video_duration = 'video_duration'#
video_aspect_ratios = 'video_aspect_ratios'#
video_width = 'video_width'#
video_height = 'video_height'#
video_ocr_area_ratio = 'video_ocr_area_ratio'#
video_aesthetic_score = 'video_aesthetic_score'#
video_frames_aesthetics_score = 'video_frames_aesthetics_score'#
video_motion_score = 'video_motion_score'#
video_nsfw_score = 'video_nsfw_score'#
video_watermark_prob = 'video_watermark_prob'#
image_text_similarity = 'image_text_similarity'#
image_text_matching_score = 'image_text_matching_score'#
phrase_grounding_recall = 'phrase_grounding_recall'#
video_frames_text_similarity = 'video_frames_text_similarity'#
general_field_filter_condition = 'general_field_filter_condition'#
class data_juicer.utils.constant.StatsKeys[source]#

Bases: object

class data_juicer.utils.constant.HashKeys[source]#

Bases: object

uid = '__dj__uid'#
hash = '__dj__hash'#
minhash = '__dj__minhash'#
simhash = '__dj__simhash'#
imagehash = '__dj__imagehash'#
videohash = '__dj__videohash'#
is_unique = '__dj__is_unique'#
class data_juicer.utils.constant.InterVars[source]#

Bases: object

lines = '__dj__lines'#
words = '__dj__words'#
refined_words = '__dj__refined_words'#
loaded_images = '__dj__loaded_images'#
loaded_audios = '__dj__loaded_audios'#
loaded_videos = '__dj__loaded_videos'#
sampled_frames = '__dj__sampled_frames'#
class data_juicer.utils.constant.JobRequiredKeys(value)[source]#

Bases: Enum

hook = 'hook'#
meta_name = 'meta_name'#
input = 'input'#
output = 'output'#
local = 'local'#
dj_configs = 'dj_configs'#
extra_configs = 'extra_configs'#