class Evo2PreprocessingConfig(BaseModel):
    """Pydantic model class specifying the configuration schema for a preprocessed IndexedDataset (.bin, .idx)."""

    # --- Paths ---
    datapaths: list[Path] = []
    output_dir: None | Path = None
    output_prefix: None | str = None

    # --- Random data split (defaults: 0.7 train / 0.2 valid / 0.1 test) ---
    train_split: float = 0.7
    valid_split: float = 0.2
    test_split: float = 0.1

    # When True, existing binaries are overwritten; when False, datasets that
    # were already preprocessed are skipped.
    overwrite: bool = False

    # --- Raw preprocessing transforms ---
    embed_reverse_complement: bool = False
    random_reverse_complement: float = 0.0
    random_lineage_dropout: float = 0.0
    transcribe: None | Literal["transcribe", "back_transcribe"] = None
    force_uppercase: bool = False
    indexed_dataset_dtype: str = "uint8"

    # --- Tokenization transforms ---
    append_eod: bool = True
    enforce_sample_length: None | int = None
    ftfy: bool = False

    # --- NeMo tokenizer configuration ---
    tokenizer_type: Literal[
        "Byte-Level",
        "HuggingFace",
        "SentencePiece",
        "Regex",
        "Megatron",
        "Tiktoken",
    ] = "Byte-Level"
    vocab_file: None | Path = None
    vocab_size: None | int = 512
    merges_file: None | Path = None
    tokenizer_model_name: None | str = None
    pretrained_tokenizer_model: None | str = None
    special_tokens: None | dict[str, str] = {}
    fast_hf_tokenizer: bool = False

    # --- Compute configuration ---
    # NOTE: When preprocessing a large number of short individual sequences
    # (< 1000 bp), do NOT enable multiprocessing (workers > 1): sequence-level
    # parallel IPC will dominate the preprocessing time!
    workers: int = 1
    preproc_concurrency: int = 100000
    chunksize: int = 1

    # --- Filters ---
    drop_empty_sequences: bool = False
    nnn_filter: bool = False

    # --- RNG ---
    seed: None | int = None

    # --- Evo2 taxonomic lineage tags ---
    # SeqID sub-string indexing: "ABC" will have taxonomy data from "A".
    taxonomy_data: dict[str, Evo2TaxonomyLineage] = {}
    # Periodicity of injecting phylogenetic lineage tags into the sequence
    # prior to tokenization.
    prompt_spacer_length: int = 131072
Evo2TaxonomyLineage
Bases: BaseModel
Pydantic model class that defines the source lineage of a DNA sequence.
Source code in bionemo/evo2/utils/config.py
Lines 26–35
class Evo2TaxonomyLineage(BaseModel):
    """Pydantic model class that defines the source lineage of a DNA sequence."""

    # Every rank is optional and defaults to None. The class rank is spelled
    # `clazz` because `class` is a reserved word in Python.
    domain: None | str = None
    phylum: None | str = None
    clazz: None | str = None
    order: None | str = None
    family: None | str = None
    genus: None | str = None
    species: None | str = None