"""Command-line interface: `sf-cluster build ...`.""" from __future__ import annotations import argparse import json import sys from pathlib import Path import numpy as np from . import __version__ from .methods import N_SUBSETS, TARGET_SIZE, build_subsets def _add_build_parser(sub: argparse._SubParsersAction) -> None: p = sub.add_parser( "build", help="Build N MSA subsets from a filtered A3M + per-residue FI matrix.", ) p.add_argument("--a3m", required=True, type=Path, help="path to filtered A3M file") p.add_argument("--fi", required=True, type=Path, help="path to per-residue FI matrix .npy (N_seq, L)") p.add_argument("--method", required=True, choices=["mosaic", "gradient"], help="subset construction method") p.add_argument("--n-subsets", type=int, default=N_SUBSETS, help=f"number of subsets (default {N_SUBSETS})") p.add_argument("--subset-size", type=int, default=TARGET_SIZE, help=f"sequences per subset (default {TARGET_SIZE})") p.add_argument("--hv-percentile", type=float, default=80.0, help="column-variance percentile for HV mask (default 80)") p.add_argument("--seed", type=int, default=20260422, help="global RNG seed tag (recorded in sidecar; " "per-subset seeds are method-deterministic)") p.add_argument("--query-index", type=int, default=0, help="index of query in the A3M pool (default 0)") p.add_argument("--out", required=True, type=Path, help="output directory for subset A3Ms") p.set_defaults(func=_cmd_build) def _cmd_build(args: argparse.Namespace) -> int: if not args.a3m.exists(): print(f"error: A3M not found: {args.a3m}", file=sys.stderr) return 2 if not args.fi.exists(): print(f"error: FI matrix not found: {args.fi}", file=sys.stderr) return 2 args.out.mkdir(parents=True, exist_ok=True) pool, score, subsets, paths = build_subsets( a3m_path=args.a3m, fi_npy_path=args.fi, method=args.method, n_subsets=args.n_subsets, subset_size=args.subset_size, hv_percentile=args.hv_percentile, out_dir=args.out, query_index=args.query_index, ) # Sidecar: subset index TSV idx_tsv = args.out / f"{args.method}_subset_index.tsv" with open(idx_tsv, "w") as fh: fh.write("subset_id\tseq_index\tpool_index\theader\tcontrast_hvlv\n") for s_i, idx_list in enumerate(subsets): for j, p_i in enumerate(idx_list): fh.write(f"{s_i:03d}\t{j}\t{p_i}\t{pool.headers[p_i]}\t" f"{score[p_i]:.6f}\n") # Sidecar: provenance JSON meta = { "sf_cluster_version": __version__, "method": args.method, "a3m": str(args.a3m.resolve()), "fi_matrix": str(args.fi.resolve()), "n_subsets": args.n_subsets, "subset_size": args.subset_size, "hv_percentile": args.hv_percentile, "pool_size": pool.n_seq, "n_cols": pool.n_cols, "seed_tag": args.seed, "query_header": pool.headers[args.query_index], "score_stats": { "min": float(np.min(score)), "max": float(np.max(score)), "mean": float(np.mean(score)), "std": float(np.std(score)), }, } (args.out / f"{args.method}_meta.json").write_text(json.dumps(meta, indent=2)) print(f"[sf-cluster] method={args.method} pool={pool.n_seq} " f"wrote {len(paths)} A3Ms to {args.out}") return 0 def build_parser() -> argparse.ArgumentParser: p = argparse.ArgumentParser( prog="sf-cluster", description="Frustration-guided MSA subset builders " "(mosaic + gradient).", ) p.add_argument("--version", action="version", version=f"sf-cluster {__version__}") sub = p.add_subparsers(dest="command", required=True) _add_build_parser(sub) return p def main(argv=None) -> int: parser = build_parser() args = parser.parse_args(argv) return args.func(args) if __name__ == "__main__": sys.exit(main())