"""Lookup tables that map internal identifiers to human-readable labels.

``MODEL_NAMES`` and ``DATASET_NAMES`` map identifier strings to display
labels.  Many labels embed ``$...$`` math markup (subscripts such as
``_{150M}``).  Several identifiers may point at the same label; those
aliases must stay byte-identical so that every spelling of one dataset or
model ends up under a single label.

``CLS_PREFS`` and ``REG_PREFS`` are ordered lists of
``(metric key, display label)`` pairs.
"""
from __future__ import annotations

from typing import List, Tuple


# Model identifier -> display label.
MODEL_NAMES = {
    # Random / untrained baselines
    'Random': 'Random vectors',
    'Random-ESM2-8': r'$Random ESM2_{8M}$',
    'Random-ESM2-35': r'$Random ESM2_{35M}$',
    'Random-ESM2-150': r'$Random ESM2_{150M}$',
    'Random-ESM2-650': r'$Random ESM2_{650M}$',
    'Random-Transformer': 'Random Transformer',
    # ESM family
    'ESM2-8': r'$ESM2_{8M}$',
    'ESM2-35': r'$ESM2_{35M}$',
    'ESM2-150': r'$ESM2_{150M}$',
    'ESM2-650': r'$ESM2_{650M}$',
    'ESM2-3B': r'$ESM2_{3B}$',
    'ESM2-diff-150': r'$ESM2_{diff-150M}$',
    'ESM2-diffAV-150': r'$ESM2_{diffAV-150M}$',
    'ESMC-300': r'$ESMC_{300M}$',
    'ESMC-600': r'$ESMC_{600M}$',
    'E1-150': r'$E1_{150M}$',
    'E1-300': r'$E1_{300M}$',
    'E1-600': r'$E1_{600M}$',
    # ProtBert / ProtT5
    'ProtBert': r'$ProtBert_{420M}$',
    'ProtBert-BFD': r'$ProtBert_{BFD}$',
    'ProtT5': r'ProtT5-enc$_{3B}$',
    'ProtT5-XL-UniRef50-full-prec': r'ProtT5-XL$_{UniRef50}$',
    'ProtT5-XXL-UniRef50': r'ProtT5-XXL$_{UniRef50}$',
    'ProtT5-XL-BFD': r'ProtT5-XL$_{BFD}$',
    'ProtT5-XXL-BFD': r'ProtT5-XXL$_{BFD}$',
    # ANKH
    'ANKH-Base': r'ANKH-Base$_{400M}$',
    'ANKH-Large': r'ANKH-Large$_{1.2B}$',
    'ANKH2-Large': r'ANKH2-Large$_{1.2B}$',
    # DSM / GLM2 / DPLM / ProtCLM
    'DSM-150': r'$DSM_{150M}$',
    'DSM-650': r'$DSM_{650M}$',
    'DSM-PPI': r'$DSM_{PPI}$',
    'GLM2-150': r'$GLM2_{150M}$',
    'GLM2-650': r'$GLM2_{650M}$',
    'GLM2-GAIA': r'$GLM2_{GAIA}$',
    'DPLM-150': r'$DPLM_{150M}$',
    'DPLM-650': r'$DPLM_{650M}$',
    'DPLM-3B': r'$DPLM_{3B}$',
    'ProtCLM-1b': r'$ProtCLM_{1B}$',
    # One-hot encodings
    'OneHot-Protein': 'OneHot Protein',
    'OneHot-DNA': 'OneHot DNA',
    'OneHot-RNA': 'OneHot RNA',
    'OneHot-Codon': 'OneHot Codon',
    # AMPLIFY
    'AMPLIFY-120': r'$AMPLIFY_{120M}$',
    'AMPLIFY-350': r'$AMPLIFY_{350M}$',
}

# Dataset identifier -> display label.  Most datasets are listed under more
# than one identifier; all spellings of one dataset share one label.
#
# NOTE(review): if these labels are rendered with matplotlib mathtext, spaces
# inside ``$...$`` are dropped and ``-`` is drawn as a minus sign, which
# affects labels such as ``$Human PPI_{bernett}$`` and
# ``$Human-PPI_{saprot}$``.  Confirm against the rendered figures before
# changing any of them.
DATASET_NAMES = {
    # Gene Ontology and Enzyme Commission
    'EC': 'EC',
    'EC_reg': 'EC',
    'GO-CC': r'$GO_{CC}$',
    'CC_reg': r'$GO_{CC}$',
    'GO-BP': r'$GO_{BP}$',
    'BP_reg': r'$GO_{BP}$',
    'GO-MF': r'$GO_{MF}$',
    'MF_reg': r'$GO_{MF}$',
    # Basic protein properties
    'MB': 'MB',
    'MB_reg': 'MB',
    'DeepLoc-2': r'$DL_{2}$',
    'DL2_reg': r'$DL_{2}$',
    'DeepLoc-10': r'$DL_{10}$',
    'DL10_reg': r'$DL_{10}$',
    'Subcellular': 'Subcellular',
    'SL_13': 'Subcellular',
    'enzyme-kcat': r'$k_{cat}$',
    'enzyme_kcat': r'$k_{cat}$',
    'solubility': 'solubility',
    'solubility_prediction': 'solubility',
    'localization': 'localization',
    'localization_prediction': 'localization',
    'temperature-stability': 'temperature stability',
    'temperature_stability': 'temperature stability',
    'optimal-temperature': 'optimal temperature',
    'optimal_temperature': 'optimal temperature',
    'optimal-ph': 'optimal pH',
    'optimal_ph': 'optimal pH',
    'material-production': 'material production',
    'material_production': 'material production',
    'fitness-prediction': 'fitness',
    'fitness_prediction': 'fitness',
    'number-of-folds': 'folds',
    'fold_prediction': 'folds',
    'cloning-clf': 'cloning-clf',
    'cloning_clf': 'cloning-clf',
    'stability-prediction': 'stability',
    'stability_prediction': 'stability',
    'ec-active': r'$EC_{singlelabel}$',
    'ec_active': r'$EC_{singlelabel}$',
    'ecoli_expression': 'E. coli expression',
    'soluprot': 'soluprot',
    'KSMoFinder-clustered': r'$KSMoFinder_{clustered}$',
    'ksmo_clustered': r'$KSMoFinder_{clustered}$',
    'KSMoFinder': 'KSMoFinder',
    'KSmo_fixed': 'KSMoFinder',
    # Protein-protein interactions
    'human-ppi-saprot': r'$Human-PPI_{saprot}$',
    'HPPI': r'$Human-PPI_{saprot}$',
    'human-ppi-pinui': r'$Human-PPI_{PiNUI}$',
    'HPPI_PiNUI': r'$Human-PPI_{PiNUI}$',
    'yeast-ppi-pinui': r'$Yeast-PPI_{PiNUI}$',
    'YPPI_PiNUI': r'$Yeast-PPI_{PiNUI}$',
    'peptide-HLA-MHC-affinity': 'peptide HLA MHC affinity',
    'peptide_HLA_MHC_affinity_ppi': 'peptide HLA MHC affinity',
    'shs27-ppi-raw': r'$SHS_{27k-raw}-ppi$',
    'SHS27k': r'$SHS_{27k-raw}-ppi$',
    'shs148-ppi-raw': r'$SHS_{148k-raw}-ppi$',
    'SHS148k': r'$SHS_{148k-raw}-ppi$',
    'shs27-ppi-random': r'$SHS_{27k-random}-ppi$',
    'shs148-ppi-random': r'$SHS_{148k-random}-ppi$',
    'shs27-ppi-dfs': r'$SHS_{27k-dfs}-ppi$',
    'shs148-ppi-dfs': r'$SHS_{148k-dfs}-ppi$',
    'shs27-ppi-bfs': r'$SHS_{27k-bfs}-ppi$',
    'shs148-ppi-bfs': r'$SHS_{148k-bfs}-ppi$',
    # The legacy STRING ids use exactly the same label as their
    # ``ppi_STRING_*_2025`` aliases below (``$NAME_{split}-ppi$``, the same
    # convention as the SHS entries) so both spellings share one label.
    'string-ppi-random': r'$STRING_{random}-ppi$',
    'string-ppi-dfs': r'$STRING_{dfs}-ppi$',
    'string-ppi-bfs': r'$STRING_{bfs}-ppi$',
    'ppi_SHS148k_bfs_2025': r'$SHS_{148k-bfs}-ppi$',
    'ppi_SHS148k_dfs_2025': r'$SHS_{148k-dfs}-ppi$',
    'ppi_SHS27k_bfs_2025': r'$SHS_{27k-bfs}-ppi$',
    'ppi_SHS27k_dfs_2025': r'$SHS_{27k-dfs}-ppi$',
    'ppi_SHS27k_random_2025': r'$SHS_{27k-random}-ppi$',
    'ppi_SHS148k_random_2025': r'$SHS_{148k-random}-ppi$',
    'ppi_STRING_random_2025': r'$STRING_{random}-ppi$',
    'ppi_STRING_dfs_2025': r'$STRING_{dfs}-ppi$',
    'ppi_STRING_bfs_2025': r'$STRING_{bfs}-ppi$',
    'gold-ppi': r'$Human PPI_{bernett}$',
    'bernett_gold_ppi': r'$Human PPI_{bernett}$',
    'plm-interact': r'$PLM-Interact_{human / cross}$',
    'plm_interact_human_train_cross_ppi': r'$PLM-Interact_{human / cross}$',
    'ppi-mutation-effect': r'$PPI_{mutation effect}$',
    'ppi_mutation_effect': r'$PPI_{mutation effect}$',
    'PPA-ppi': r'$PPA_{PPI}$',
    'ppi_affinity': r'$PPA_{PPI}$',
    'ProteinProteinAffinity': 'PPI binding affinity',
    # Secondary structure
    'SecondaryStructure-3': r'$SS_{3}$',
    'SecondaryStructure-8': r'$SS_{8}$',
    'SS3': r'$SS_{3}$',
    'SS8': r'$SS_{8}$',
    # "Fitness"
    'fluorescence-prediction': 'fluorescence',
    'fluorescence_prediction': 'fluorescence',
    'millionfull_round_1_oct_2025': r'$AtOMT1_{millionfull}$',
    'million_full': r'$AtOMT1_{millionfull}$',
    # Special datasets
    'plastic': r'$plastic degradation_{benchmark}$',
    'plastic_degradation_benchmark': r'$plastic degradation_{benchmark}$',
    'foldseek-fold': 'foldseek fold',
    'foldseek-inverse': 'foldseek inverse',
    'foldseek_dataset': 'foldseek',
    'bernett_processed': r'$Bernett_{processed}$',
    # ProteinGym datasets
    'proteingym_zs': r'$ProteinGym_{zero-shot}$',
    'proteingym_supervised': r'$ProteinGym_{supervised}$',
    # Taxonomic datasets
    'taxonomy_domain': r'$taxonomy_{domain}$',
    'taxonomy_kingdom': r'$taxonomy_{kingdom}$',
    'taxonomy_phylum': r'$taxonomy_{phylum}$',
    'taxonomy_class': r'$taxonomy_{class}$',
    'taxonomy_order': r'$taxonomy_{order}$',
    'taxonomy_family': r'$taxonomy_{family}$',
    'taxonomy_genus': r'$taxonomy_{genus}$',
    'taxonomy_species': r'$taxonomy_{species}$',
    'diff_phylogeny': r'$taxonomy_{different}$',
    'diff_phylo': r'$taxonomy_{different}$',
    'taxonomy_domain_0.4_clusters': r'$taxonomy_{domain}$',
    'taxonomy_kingdom_0.4_clusters': r'$taxonomy_{kingdom}$',
    'taxonomy_phylum_0.4_clusters': r'$taxonomy_{phylum}$',
    'taxonomy_class_0.4_clusters': r'$taxonomy_{class}$',
    'taxonomy_order_0.4_clusters': r'$taxonomy_{order}$',
    'taxonomy_family_0.4_clusters': r'$taxonomy_{family}$',
    'taxonomy_genus_0.4_clusters': r'$taxonomy_{genus}$',
    'taxonomy_species_0.4_clusters': r'$taxonomy_{species}$',
    'taxon_domain': r'$taxonomy_{domain}$',
    'taxon_kingdom': r'$taxonomy_{kingdom}$',
    'taxon_phylum': r'$taxonomy_{phylum}$',
    'taxon_class': r'$taxonomy_{class}$',
    'taxon_order': r'$taxonomy_{order}$',
    'taxon_family': r'$taxonomy_{family}$',
    'taxon_genus': r'$taxonomy_{genus}$',
    'taxon_species': r'$taxonomy_{species}$',
    # Other datasets
    'plddt': r'$pLDDT_{AlphaFold2}$',
    'af2_plddt': r'$pLDDT_{AlphaFold2}$',
    'realness': r'$Realness_{dataset}$',
    'realness_dataset': r'$Realness_{dataset}$',
}

# (metric key, display label) pairs for classification metrics.
# Presumably listed in order of preference - confirm against the consumer.
CLS_PREFS: List[Tuple[str, str]] = [
    ("f1", "F1"),
    ("mcc", "MCC"),
    ("accuracy", "Accuracy"),
]

# (metric key, display label) pairs for regression metrics.
# Presumably listed in order of preference - confirm against the consumer.
REG_PREFS: List[Tuple[str, str]] = [
    ("spearman", "Spearman rho"),
    ("r_squared", "R²"),
    ("pearson", "Pearson r"),
]