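# NOTE: the next three lines appear to be page-header residue from the hosting
# site's file viewer, kept here as comments so the module still parses.
# nikraf's picture
# Upload folder using huggingface_hub
# 714cf46 verified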
from __future__ import annotations
from typing import List, Tuple
# Lookup table from a model identifier to the label shown for it.
#
# Most labels wrap text in `$...$` with a `_{...}` subscript (TeX-style math
# markup); a few are plain strings.
# NOTE(review): presumably these are rendered by matplotlib mathtext. If so,
# whitespace inside `$...$` is not rendered (e.g. '$Random ESM2_{8M}$') --
# confirm the labels look as intended.
#
# Entry order is kept exactly as written; dicts preserve insertion order, so
# anything iterating over this table sees the same sequence.
MODEL_NAMES = {
    # --- Random-* entries ---
    'Random': 'Random vectors',
    'Random-ESM2-8': r'$Random ESM2_{8M}$',
    'Random-ESM2-35': r'$Random ESM2_{35M}$',
    'Random-ESM2-150': r'$Random ESM2_{150M}$',
    'Random-ESM2-650': r'$Random ESM2_{650M}$',
    'Random-Transformer': 'Random Transformer',

    # --- ESM2 / ESMC / E1 entries ---
    'ESM2-8': r'$ESM2_{8M}$',
    'ESM2-35': r'$ESM2_{35M}$',
    'ESM2-150': r'$ESM2_{150M}$',
    'ESM2-650': r'$ESM2_{650M}$',
    'ESM2-3B': r'$ESM2_{3B}$',
    'ESM2-diff-150': r'$ESM2_{diff-150M}$',
    'ESM2-diffAV-150': r'$ESM2_{diffAV-150M}$',
    'ESMC-300': r'$ESMC_{300M}$',
    'ESMC-600': r'$ESMC_{600M}$',
    'E1-150': r'$E1_{150M}$',
    'E1-300': r'$E1_{300M}$',
    'E1-600': r'$E1_{600M}$',

    # --- ProtBert / ProtT5 entries ---
    'ProtBert': r'$ProtBert_{420M}$',
    'ProtBert-BFD': r'$ProtBert_{BFD}$',
    'ProtT5': r'ProtT5-enc$_{3B}$',
    'ProtT5-XL-UniRef50-full-prec': r'ProtT5-XL$_{UniRef50}$',
    'ProtT5-XXL-UniRef50': r'ProtT5-XXL$_{UniRef50}$',
    'ProtT5-XL-BFD': r'ProtT5-XL$_{BFD}$',
    'ProtT5-XXL-BFD': r'ProtT5-XXL$_{BFD}$',

    # --- ANKH entries ---
    'ANKH-Base': r'ANKH-Base$_{400M}$',
    'ANKH-Large': r'ANKH-Large$_{1.2B}$',
    'ANKH2-Large': r'ANKH2-Large$_{1.2B}$',

    # --- DSM / GLM2 / DPLM / ProtCLM entries ---
    'DSM-150': r'$DSM_{150M}$',
    'DSM-650': r'$DSM_{650M}$',
    'DSM-PPI': r'$DSM_{PPI}$',
    'GLM2-150': r'$GLM2_{150M}$',
    'GLM2-650': r'$GLM2_{650M}$',
    'GLM2-GAIA': r'$GLM2_{GAIA}$',
    'DPLM-150': r'$DPLM_{150M}$',
    'DPLM-650': r'$DPLM_{650M}$',
    'DPLM-3B': r'$DPLM_{3B}$',
    'ProtCLM-1b': r'$ProtCLM_{1B}$',

    # --- OneHot-* entries ---
    'OneHot-Protein': 'OneHot Protein',
    'OneHot-DNA': 'OneHot DNA',
    'OneHot-RNA': 'OneHot RNA',
    'OneHot-Codon': 'OneHot Codon',

    # --- AMPLIFY entries ---
    'AMPLIFY-120': r'$AMPLIFY_{120M}$',
    'AMPLIFY-350': r'$AMPLIFY_{350M}$',
}
# Lookup table from a dataset identifier to the label shown for it.
#
# Most datasets are listed under more than one spelling (e.g. 'GO-CC' and
# 'CC_reg', or 'optimal-ph' and 'optimal_ph'); every spelling of the same
# dataset maps to the same label so results are grouped under one name
# regardless of which key was used.
#
# Labels that use `$...$` with `_{...}` are TeX-style math markup.
# NOTE(review): presumably rendered by matplotlib mathtext. If so, whitespace
# inside `$...$` is not rendered (e.g. '$Human PPI_{bernett}$') -- confirm the
# labels look as intended.
#
# Entry order is kept exactly as written (dicts preserve insertion order).
DATASET_NAMES = {
    # Gene Ontology and Enzyme Commission
    'EC': 'EC',
    'EC_reg': 'EC',
    'GO-CC': r'$GO_{CC}$',
    'CC_reg': r'$GO_{CC}$',
    'GO-BP': r'$GO_{BP}$',
    'BP_reg': r'$GO_{BP}$',
    'GO-MF': r'$GO_{MF}$',
    'MF_reg': r'$GO_{MF}$',

    # Basic protein properties
    'MB': 'MB',
    'MB_reg': 'MB',
    'DeepLoc-2': r'$DL_{2}$',
    'DL2_reg': r'$DL_{2}$',
    'DeepLoc-10': r'$DL_{10}$',
    'DL10_reg': r'$DL_{10}$',
    'Subcellular': 'Subcellular',
    'SL_13': 'Subcellular',
    'enzyme-kcat': r'$k_{cat}$',
    'enzyme_kcat': r'$k_{cat}$',
    'solubility': 'solubility',
    'solubility_prediction': 'solubility',
    'localization': 'localization',
    'localization_prediction': 'localization',
    'temperature-stability': 'temperature stability',
    'temperature_stability': 'temperature stability',
    'optimal-temperature': 'optimal temperature',
    'optimal_temperature': 'optimal temperature',
    'optimal-ph': 'optimal pH',
    'optimal_ph': 'optimal pH',
    'material-production': 'material production',
    'material_production': 'material production',
    'fitness-prediction': 'fitness',
    'fitness_prediction': 'fitness',
    'number-of-folds': 'folds',
    'fold_prediction': 'folds',
    'cloning-clf': 'cloning-clf',
    'cloning_clf': 'cloning-clf',
    'stability-prediction': 'stability',
    'stability_prediction': 'stability',
    'ec-active': r'$EC_{singlelabel}$',
    'ec_active': r'$EC_{singlelabel}$',
    'ecoli_expression': 'E. coli expression',
    'soluprot': 'soluprot',
    'KSMoFinder-clustered': r'$KSMoFinder_{clustered}$',
    'ksmo_clustered': r'$KSMoFinder_{clustered}$',
    'KSMoFinder': 'KSMoFinder',
    'KSmo_fixed': 'KSMoFinder',

    # Protein-protein interactions
    'human-ppi-saprot': r'$Human-PPI_{saprot}$',
    'HPPI': r'$Human-PPI_{saprot}$',
    'human-ppi-pinui': r'$Human-PPI_{PiNUI}$',
    'HPPI_PiNUI': r'$Human-PPI_{PiNUI}$',
    'yeast-ppi-pinui': r'$Yeast-PPI_{PiNUI}$',
    'YPPI_PiNUI': r'$Yeast-PPI_{PiNUI}$',
    'peptide-HLA-MHC-affinity': 'peptide HLA MHC affinity',
    'peptide_HLA_MHC_affinity_ppi': 'peptide HLA MHC affinity',
    'shs27-ppi-raw': r'$SHS_{27k-raw}-ppi$',
    'SHS27k': r'$SHS_{27k-raw}-ppi$',
    'shs148-ppi-raw': r'$SHS_{148k-raw}-ppi$',
    'SHS148k': r'$SHS_{148k-raw}-ppi$',
    'shs27-ppi-random': r'$SHS_{27k-random}-ppi$',
    'shs148-ppi-random': r'$SHS_{148k-random}-ppi$',
    'shs27-ppi-dfs': r'$SHS_{27k-dfs}-ppi$',
    'shs148-ppi-dfs': r'$SHS_{148k-dfs}-ppi$',
    'shs27-ppi-bfs': r'$SHS_{27k-bfs}-ppi$',
    'shs148-ppi-bfs': r'$SHS_{148k-bfs}-ppi$',
    # The 'string-ppi-*' labels use the same '$STRING_{split}-ppi$' form as
    # the 'ppi_STRING_*_2025' keys further down (and as the SHS entries), so
    # both spellings of a STRING split resolve to one identical label.
    'string-ppi-random': r'$STRING_{random}-ppi$',
    'string-ppi-dfs': r'$STRING_{dfs}-ppi$',
    'string-ppi-bfs': r'$STRING_{bfs}-ppi$',
    'ppi_SHS148k_bfs_2025': r'$SHS_{148k-bfs}-ppi$',
    'ppi_SHS148k_dfs_2025': r'$SHS_{148k-dfs}-ppi$',
    'ppi_SHS27k_bfs_2025': r'$SHS_{27k-bfs}-ppi$',
    'ppi_SHS27k_dfs_2025': r'$SHS_{27k-dfs}-ppi$',
    'ppi_SHS27k_random_2025': r'$SHS_{27k-random}-ppi$',
    'ppi_SHS148k_random_2025': r'$SHS_{148k-random}-ppi$',
    'ppi_STRING_random_2025': r'$STRING_{random}-ppi$',
    'ppi_STRING_dfs_2025': r'$STRING_{dfs}-ppi$',
    'ppi_STRING_bfs_2025': r'$STRING_{bfs}-ppi$',
    'gold-ppi': r'$Human PPI_{bernett}$',
    'bernett_gold_ppi': r'$Human PPI_{bernett}$',
    'plm-interact': r'$PLM-Interact_{human / cross}$',
    'plm_interact_human_train_cross_ppi': r'$PLM-Interact_{human / cross}$',
    'ppi-mutation-effect': r'$PPI_{mutation effect}$',
    'ppi_mutation_effect': r'$PPI_{mutation effect}$',
    'PPA-ppi': r'$PPA_{PPI}$',
    'ppi_affinity': r'$PPA_{PPI}$',
    # NOTE(review): labelled differently from 'PPA-ppi' / 'ppi_affinity';
    # left as-is since it may be a distinct dataset -- confirm.
    'ProteinProteinAffinity': 'PPI binding affinity',

    # Secondary structure
    'SecondaryStructure-3': r'$SS_{3}$',
    'SecondaryStructure-8': r'$SS_{8}$',
    'SS3': r'$SS_{3}$',
    'SS8': r'$SS_{8}$',

    # "Fitness"
    'fluorescence-prediction': 'fluorescence',
    'fluorescence_prediction': 'fluorescence',
    'millionfull_round_1_oct_2025': r'$AtOMT1_{millionfull}$',
    'million_full': r'$AtOMT1_{millionfull}$',

    # Special datasets
    'plastic': r'$plastic degradation_{benchmark}$',
    'plastic_degradation_benchmark': r'$plastic degradation_{benchmark}$',
    'foldseek-fold': 'foldseek fold',
    'foldseek-inverse': 'foldseek inverse',
    'foldseek_dataset': 'foldseek',
    'bernett_processed': r'$Bernett_{processed}$',

    # ProteinGym datasets
    'proteingym_zs': r'$ProteinGym_{zero-shot}$',
    'proteingym_supervised': r'$ProteinGym_{supervised}$',

    # Taxonomic datasets (three key spellings per rank share one label)
    'taxonomy_domain': r'$taxonomy_{domain}$',
    'taxonomy_kingdom': r'$taxonomy_{kingdom}$',
    'taxonomy_phylum': r'$taxonomy_{phylum}$',
    'taxonomy_class': r'$taxonomy_{class}$',
    'taxonomy_order': r'$taxonomy_{order}$',
    'taxonomy_family': r'$taxonomy_{family}$',
    'taxonomy_genus': r'$taxonomy_{genus}$',
    'taxonomy_species': r'$taxonomy_{species}$',
    'diff_phylogeny': r'$taxonomy_{different}$',
    'diff_phylo': r'$taxonomy_{different}$',
    'taxonomy_domain_0.4_clusters': r'$taxonomy_{domain}$',
    'taxonomy_kingdom_0.4_clusters': r'$taxonomy_{kingdom}$',
    'taxonomy_phylum_0.4_clusters': r'$taxonomy_{phylum}$',
    'taxonomy_class_0.4_clusters': r'$taxonomy_{class}$',
    'taxonomy_order_0.4_clusters': r'$taxonomy_{order}$',
    'taxonomy_family_0.4_clusters': r'$taxonomy_{family}$',
    'taxonomy_genus_0.4_clusters': r'$taxonomy_{genus}$',
    'taxonomy_species_0.4_clusters': r'$taxonomy_{species}$',
    'taxon_domain': r'$taxonomy_{domain}$',
    'taxon_kingdom': r'$taxonomy_{kingdom}$',
    'taxon_phylum': r'$taxonomy_{phylum}$',
    'taxon_class': r'$taxonomy_{class}$',
    'taxon_order': r'$taxonomy_{order}$',
    'taxon_family': r'$taxonomy_{family}$',
    'taxon_genus': r'$taxonomy_{genus}$',
    'taxon_species': r'$taxonomy_{species}$',

    # Other datasets
    'plddt': r'$pLDDT_{AlphaFold2}$',
    'af2_plddt': r'$pLDDT_{AlphaFold2}$',
    'realness': r'$Realness_{dataset}$',
    'realness_dataset': r'$Realness_{dataset}$',
}
# Ordered (metric key, display label) pairs: a lowercase key paired with the
# label text used for it.
# NOTE(review): presumably the classification metrics in order of preference
# (first match wins) -- confirm against the code that consumes this list.
CLS_PREFS: List[Tuple[str, str]] = [
    ("f1", "F1"),
    ("mcc", "MCC"),
    ("accuracy", "Accuracy"),
]
# Ordered (metric key, display label) pairs: a lowercase key paired with the
# label text used for it.
# NOTE(review): presumably the regression metrics in order of preference
# (first match wins) -- confirm against the code that consumes this list.
REG_PREFS: List[Tuple[str, str]] = [
    ("spearman", "Spearman rho"),
    ("r_squared", "R²"),
    ("pearson", "Pearson r"),
]