| """Command-line interface: `sf-cluster build ...`.""" |
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import sys |
| from pathlib import Path |
|
|
| import numpy as np |
|
|
| from . import __version__ |
| from .methods import N_SUBSETS, TARGET_SIZE, build_subsets |
|
|
|
|
def _add_build_parser(sub: argparse._SubParsersAction) -> None:
    """Register the ``build`` subcommand and all of its options on *sub*.

    The created sub-parser stores ``_cmd_build`` as its ``func`` default, so
    the parsed namespace carries the handler to run.
    """
    build = sub.add_parser(
        "build",
        help="Build N MSA subsets from a filtered A3M + per-residue FI matrix.",
    )

    # (flag, add_argument keyword arguments), listed in --help display order.
    option_specs = [
        ("--a3m", {
            "required": True,
            "type": Path,
            "help": "path to filtered A3M file",
        }),
        ("--fi", {
            "required": True,
            "type": Path,
            "help": "path to per-residue FI matrix .npy (N_seq, L)",
        }),
        ("--method", {
            "required": True,
            "choices": ["mosaic", "gradient"],
            "help": "subset construction method",
        }),
        ("--n-subsets", {
            "type": int,
            "default": N_SUBSETS,
            "help": f"number of subsets (default {N_SUBSETS})",
        }),
        ("--subset-size", {
            "type": int,
            "default": TARGET_SIZE,
            "help": f"sequences per subset (default {TARGET_SIZE})",
        }),
        ("--hv-percentile", {
            "type": float,
            "default": 80.0,
            "help": "column-variance percentile for HV mask (default 80)",
        }),
        ("--seed", {
            "type": int,
            "default": 20260422,
            "help": ("global RNG seed tag (recorded in sidecar; "
                     "per-subset seeds are method-deterministic)"),
        }),
        ("--query-index", {
            "type": int,
            "default": 0,
            "help": "index of query in the A3M pool (default 0)",
        }),
        ("--out", {
            "required": True,
            "type": Path,
            "help": "output directory for subset A3Ms",
        }),
    ]
    for flag, spec in option_specs:
        build.add_argument(flag, **spec)

    build.set_defaults(func=_cmd_build)
|
|
|
|
def _cmd_build(args: argparse.Namespace) -> int:
    """Run the ``build`` subcommand.

    Calls ``build_subsets`` with the parsed options, then writes two extra
    files into ``args.out``:

    * ``<method>_subset_index.tsv`` - one row per (subset, member) pair with
      the member's pool index, header and ``score`` value.
    * ``<method>_meta.json`` - a sidecar recording the run parameters, pool
      dimensions, the query header and summary statistics of ``score``.

    Both files are written as UTF-8 regardless of the platform locale.

    Args:
        args: Namespace produced by the ``build`` sub-parser (attributes
            ``a3m``, ``fi``, ``method``, ``n_subsets``, ``subset_size``,
            ``hv_percentile``, ``seed``, ``query_index`` and ``out``).

    Returns:
        ``0`` on success, ``2`` when either input file does not exist.
    """
    if not args.a3m.exists():
        print(f"error: A3M not found: {args.a3m}", file=sys.stderr)
        return 2
    if not args.fi.exists():
        print(f"error: FI matrix not found: {args.fi}", file=sys.stderr)
        return 2

    args.out.mkdir(parents=True, exist_ok=True)

    pool, score, subsets, paths = build_subsets(
        a3m_path=args.a3m,
        fi_npy_path=args.fi,
        method=args.method,
        n_subsets=args.n_subsets,
        subset_size=args.subset_size,
        hv_percentile=args.hv_percentile,
        out_dir=args.out,
        query_index=args.query_index,
    )

    # A header that contains a tab or line break would split one record
    # across extra columns/rows; map those characters to a space so every
    # row keeps exactly five fields. Headers without them are unchanged.
    tsv_safe = str.maketrans({"\t": " ", "\r": " ", "\n": " "})

    # Explicit UTF-8: the locale default may be unable to encode a header.
    idx_tsv = args.out / f"{args.method}_subset_index.tsv"
    with open(idx_tsv, "w", encoding="utf-8") as fh:
        fh.write("subset_id\tseq_index\tpool_index\theader\tcontrast_hvlv\n")
        for s_i, idx_list in enumerate(subsets):
            for j, p_i in enumerate(idx_list):
                header = str(pool.headers[p_i]).translate(tsv_safe)
                fh.write(f"{s_i:03d}\t{j}\t{p_i}\t{header}\t"
                         f"{score[p_i]:.6f}\n")

    # Sidecar keeps the raw (unsanitised) query header; JSON escapes it.
    meta = {
        "sf_cluster_version": __version__,
        "method": args.method,
        "a3m": str(args.a3m.resolve()),
        "fi_matrix": str(args.fi.resolve()),
        "n_subsets": args.n_subsets,
        "subset_size": args.subset_size,
        "hv_percentile": args.hv_percentile,
        "pool_size": pool.n_seq,
        "n_cols": pool.n_cols,
        "seed_tag": args.seed,  # recorded only; not passed to build_subsets
        "query_header": pool.headers[args.query_index],
        "score_stats": {
            "min": float(np.min(score)),
            "max": float(np.max(score)),
            "mean": float(np.mean(score)),
            "std": float(np.std(score)),
        },
    }
    meta_path = args.out / f"{args.method}_meta.json"
    meta_path.write_text(json.dumps(meta, indent=2), encoding="utf-8")

    print(f"[sf-cluster] method={args.method} pool={pool.n_seq} "
          f"wrote {len(paths)} A3Ms to {args.out}")
    return 0
|
|
|
|
def build_parser() -> argparse.ArgumentParser:
    """Construct the top-level ``sf-cluster`` argument parser.

    Returns:
        A parser that exposes ``--version`` and requires a subcommand (its
        name is stored under ``command``); ``build`` is registered here.
    """
    parser = argparse.ArgumentParser(
        prog="sf-cluster",
        description="Frustration-guided MSA subset builders (mosaic + gradient).",
    )
    parser.add_argument(
        "--version",
        action="version",
        version=f"sf-cluster {__version__}",
    )
    subcommands = parser.add_subparsers(dest="command", required=True)
    _add_build_parser(subcommands)
    return parser
|
|
|
|
def main(argv=None) -> int:
    """Parse *argv* and run the selected subcommand.

    Args:
        argv: Argument list handed to ``parse_args``; ``None`` makes
            argparse read ``sys.argv[1:]``.

    Returns:
        Whatever the subcommand handler stored in ``func`` returns.
    """
    namespace = build_parser().parse_args(argv)
    handler = namespace.func
    return handler(namespace)
|
|
|
|
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
|