"""Helpers for running MMseqs2 easy-cluster and summarizing its output."""

import logging
import os
import shutil
import subprocess
import sys

import pandas as pd
import rootutils
from Bio import SeqIO

logger = logging.getLogger(__name__)


def ensure_mmseqs_in_path(mmseqs_dir):
    """
    Checks if MMseqs2 is in the PATH. If it's not, add it.
    MMseqs2 will not run if this is not done correctly.

    Args:
        mmseqs_dir (str): Directory containing MMseqs2 binaries
    """
    # Nothing to do when an `mmseqs` executable is already resolvable.
    if shutil.which("mmseqs") is not None:
        return

    # Prepend so this directory takes precedence; tolerate an unset/empty PATH
    # and use the platform's separator rather than a hard-coded ":".
    current_path = os.environ.get("PATH", "")
    if current_path:
        os.environ["PATH"] = f"{mmseqs_dir}{os.pathsep}{current_path}"
    else:
        os.environ["PATH"] = str(mmseqs_dir)
    logger.info(f"\tAdded {mmseqs_dir} to PATH")


def process_fasta(fasta_path):
    """
    Reads a FASTA file into a dictionary.

    Args:
        fasta_path (str): path to the FASTA file

    Returns:
        dict: maps each record id to its sequence as a string. If an id
            occurs more than once, the last record wins.
    """
    # The context manager guarantees the handle is closed after parsing.
    with open(fasta_path) as handle:
        return {
            record.id: str(record.seq) for record in SeqIO.parse(handle, "fasta")
        }


def analyze_clustering_result(input_fasta: str, tsv_path: str) -> pd.DataFrame:
    """
    Joins a two-column cluster TSV with the sequences from the input FASTA.

    Args:
        input_fasta (str): path to input fasta file
        tsv_path (str): path to a headerless, tab-separated file whose first
            column is the representative sequence id and whose second column
            is the member sequence id

    Returns:
        pd.DataFrame: columns "representative seq_id", "member seq_id",
            "representative seq" and "member seq", sorted by the two id
            columns.

    Raises:
        KeyError: if an id in the TSV is not present in the FASTA.
    """
    # Process input fasta
    input_d = process_fasta(input_fasta)

    # Process clusters.tsv. Read ids strictly as strings: without this, pandas
    # would turn all-numeric ids into ints and ids such as "NA" into NaN, and
    # neither would match the (string) FASTA ids.
    clusters = pd.read_csv(
        tsv_path, sep="\t", header=None, dtype=str, keep_default_na=False
    )
    clusters = clusters.rename(columns={0: "representative seq_id", 1: "member seq_id"})

    # Look up the sequence behind each id (KeyError on unknown ids).
    for role in ("representative", "member"):
        clusters[f"{role} seq"] = clusters[f"{role} seq_id"].apply(
            lambda seq_id: input_d[seq_id]
        )

    # Sort them so that splitting results are reproducible
    clusters = clusters.sort_values(
        by=["representative seq_id", "member seq_id"], ascending=True
    ).reset_index(drop=True)

    return clusters


def make_fasta(sequences: dict, fasta_path: str) -> str:
    """
    Makes a fasta file from sequences, where the key is the header and the
    value is the sequence.

    Args:
        sequences (dict): A dictionary where the key is the header and the
            value is the sequence.
        fasta_path (str): Path of the fasta file to write (overwritten if it
            already exists).

    Returns:
        str: The path to the fasta file.
    """
    with open(fasta_path, "w") as f:
        for header, sequence in sequences.items():
            f.write(f">{header}\n{sequence}\n")
    return fasta_path


def run_mmseqs_clustering(
    input_fasta,
    output_dir,
    min_seq_id=0.3,
    c=0.8,
    cov_mode=0,
    cluster_mode=0,
    path_to_mmseqs="fuson_plm/mmseqs",
    dbtype=1,
):
    """
    Runs MMSeqs2 clustering using easycluster module

    Args:
        input_fasta (str): path to input fasta file, with alternating
            ">header" and sequence lines
        output_dir (str): path to output dir for clustering results; it is
            created if missing and is also passed to MMseqs2 as the third
            positional argument of easy-cluster
        min_seq_id (float): number [0,1] representing --min-seq-id in cluster command
        c (float): number [0,1] representing -c in cluster command
        cov_mode (int): number 0, 1, 2, or 3 representing --cov-mode in cluster command
        cluster_mode (int): number 0, 1, or 2 representing --cluster-mode in cluster command
        path_to_mmseqs (str): path containing "/mmseqs"; everything before its
            first occurrence is joined with "mmseqs/bin" to locate the binaries
        dbtype (int): value passed to --dbtype
            (NOTE(review): presumably 1 = amino acid; confirm against the
            installed MMseqs2 version)

    Raises:
        ValueError: if path_to_mmseqs does not contain "/mmseqs".
        subprocess.CalledProcessError: if the mmseqs command exits non-zero.
    """
    # Get mmseqs dir
    logger.info("\nRunning MMSeqs clustering...")
    path_to_mmseqs = os.fspath(path_to_mmseqs)
    marker_pos = path_to_mmseqs.find("/mmseqs")
    if marker_pos == -1:
        # Same exception type str.index raised before, with a usable message.
        raise ValueError(
            f"path_to_mmseqs must contain '/mmseqs', got {path_to_mmseqs!r}"
        )
    mmseqs_dir = os.path.join(path_to_mmseqs[:marker_pos], "mmseqs/bin")
    logger.info(f"Running mmseqs clustering from {mmseqs_dir}")

    # Ensure MMseqs2 is in the PATH
    ensure_mmseqs_in_path(mmseqs_dir)

    # Resolved through PATH (see ensure_mmseqs_in_path above).
    mmseqs_bin = "mmseqs"

    # Create the output directory
    os.makedirs(output_dir, exist_ok=True)

    # Normalise path-like arguments to str so the command can be both logged
    # with " ".join(...) and executed.
    output_dir_str = os.fspath(output_dir)

    # Run MMseqs2 easy-cluster
    cmd_easy_cluster = [
        mmseqs_bin,
        "easy-cluster",
        os.fspath(input_fasta),
        os.path.join(output_dir_str, "mmseqs"),
        output_dir_str,
        "--min-seq-id",
        str(min_seq_id),
        "-c",
        str(c),
        "--cov-mode",
        str(cov_mode),
        "--cluster-mode",
        str(cluster_mode),
        "--dbtype",
        str(dbtype),
    ]

    # Record the exact command in the log
    logger.info("\n\tCommand entered to MMSeqs2:")
    logger.info("\t" + " ".join(cmd_easy_cluster) + "\n")

    # List form (no shell); check=True raises on a non-zero exit status.
    subprocess.run(cmd_easy_cluster, check=True)

    logger.info(f"Clustering completed. Results are in {output_dir}")


def cluster_summary(clusters: pd.DataFrame) -> None:
    """
    Summarizes how many clusters were formed, how big they are, etc ...

    All output goes to the module logger; nothing is returned.

    Args:
        clusters (pd.DataFrame): one row per (representative, member) pair,
            with at least the columns "representative seq_id" and
            "member seq_id"
    """
    grouped_clusters = (
        clusters.groupby("representative seq_id")["member seq_id"]
        .count()
        .reset_index()
        .rename(columns={"member seq_id": "member count"})
    )
    assert len(grouped_clusters) == len(
        clusters["representative seq_id"].unique()
    )  # make sure number of cluster reps = # grouped clusters

    member_counts = grouped_clusters["member count"]
    total_seqs = int(member_counts.sum())
    logger.info(f"Created {len(grouped_clusters)} clusters of {total_seqs} sequences")

    # With no sequences the percentages and the maximum below are undefined
    # (division by zero / max of an empty sequence), so stop here.
    if total_seqs == 0:
        return

    singletons = member_counts[member_counts == 1]
    logger.info(f"\t{len(singletons)} clusters of size 1")
    csize1_seqs = int(singletons.sum())
    logger.info(
        f"\t\tsequences: {csize1_seqs} ({round(100*csize1_seqs/total_seqs, 2)}%)"
    )

    multi_member = member_counts[member_counts > 1]
    logger.info(f"\t{len(multi_member)} clusters of size > 1")
    csizeg1_seqs = int(multi_member.sum())
    logger.info(
        f"\t\tsequences: {csizeg1_seqs} ({round(100*csizeg1_seqs/total_seqs, 2)}%)"
    )
    logger.info(f"\tlargest cluster: {int(member_counts.max())}")

    logger.info("\nCluster size breakdown below...")

    # Name the index and the values explicitly instead of renaming the columns
    # produced by reset_index(): those default names differ between pandas
    # versions, whereas this form yields the same two columns everywhere.
    value_counts = (
        member_counts.value_counts()
        .rename_axis("cluster size (n_members)")
        .reset_index(name="n_clusters")
    )
    logger.info(
        value_counts.sort_values(
            by="cluster size (n_members)", ascending=True
        ).to_string(index=False)
    )