# nikraf's picture
# Upload folder using huggingface_hub
# 714cf46 verified
import pandas as pd
from typing import Optional
from huggingface_hub import hf_hub_download
def _load_parquet_by_dms(repo_id: str, dms_id: str) -> Optional[pd.DataFrame]:
    """
    Load a single-assay parquet shard from the Hub at by_dms_id/{DMS_id}.parquet.

    Args:
        repo_id: Hub dataset repository that holds the shards.
        dms_id: Assay identifier; it is converted with ``str()`` and used as
            the shard's file stem.

    Returns:
        The DataFrame read from the downloaded shard. The ``Optional`` in the
        annotation is kept for interface compatibility; this function never
        returns ``None`` itself.

    Raises:
        Nothing is caught here: any exception raised by ``hf_hub_download``
        or ``pd.read_parquet`` propagates to the caller.
    """
    # Exactly one shard per assay, so no list/loop is needed.
    shard_name = f"by_dms_id/{dms_id!s}.parquet"
    local_path = hf_hub_download(repo_id=repo_id, filename=shard_name, repo_type="dataset")
    return pd.read_parquet(local_path)
def load_proteingym_dms(dms_id: str, mode: str, repo_id: str = "GleghornLab/ProteinGym_DMS") -> pd.DataFrame:
    """
    Load one ProteinGym DMS assay and reduce it to the rows/columns for `mode`.

    Modes:
    - "benchmark": non-indel rows, with the substitution benchmark columns.
    - "indels": indel rows only (no "mutant" column in the output).
    - "singles": non-indel rows with exactly one mutation.
    - "multiples": non-indel rows with more than one mutation.

    Any other `mode` value applies no filtering and no column selection: the
    full assay frame is returned as loaded.

    Args:
        dms_id: Assay identifier, forwarded to `_load_parquet_by_dms`.
        mode: One of the mode names listed above.
        repo_id: Hub dataset repository to read the assay from.

    Returns:
        The selected rows and columns, with the index reset to 0..n-1.
    """
    shared_columns = ["DMS_id", "mutated_seq", "target_seq", "DMS_score", "DMS_score_bin"]
    substitution_columns = shared_columns + ["mutant"]

    frame = _load_parquet_by_dms(repo_id=repo_id, dms_id=dms_id)

    if mode == 'indels':
        # Indel rows carry no mutant triplet, so that column is left out.
        frame = frame[frame['is_indel'] == True]
        return frame[shared_columns].reset_index(drop=True)

    if mode in ('benchmark', 'singles', 'multiples'):
        # All three substitution modes start from the non-indel rows.
        frame = frame[frame['is_indel'] == False]
        if mode == 'singles':
            frame = frame[frame['num_mutations'] == 1]
        elif mode == 'multiples':
            frame = frame[frame['num_mutations'] > 1]
        frame = frame[substitution_columns]

    # Unrecognised modes reach this point with the frame untouched.
    return frame.reset_index(drop=True)