| import pandas as pd |
| import os |
| import subprocess |
| import sys |
| from Bio import SeqIO |
| import shutil |
|
|
| import rootutils |
| import logging |
|
|
# Module-level logger named after this module, per the stdlib logging convention.
logger = logging.getLogger(__name__)
|
|
|
|
def ensure_mmseqs_in_path(mmseqs_dir):
    """
    Checks if MMseqs2 is in the PATH. If it's not, add it. MMseqs2 will not run if this is not done correctly.

    Args:
        mmseqs_dir (str): Directory containing MMseqs2 binaries
    """
    # Only touch PATH when no `mmseqs` executable is already resolvable.
    # Prepending keeps this directory ahead of any other installed copies.
    # (The original also built an unused `mmseqs_bin` path here; removed.)
    if shutil.which("mmseqs") is None:
        os.environ["PATH"] = f"{mmseqs_dir}:{os.environ['PATH']}"
        logger.info(f"\tAdded {mmseqs_dir} to PATH")
|
|
|
|
def process_fasta(fasta_path):
    """
    Read a FASTA file into a dict mapping record id -> sequence string.

    Args:
        fasta_path (str): Path to the FASTA file.

    Returns:
        dict: {record id: sequence string} for every record in the file.
            If ids repeat, later records overwrite earlier ones (unchanged
            from the original behavior).
    """
    sequences = {}
    # Use a context manager so the file handle is always closed — the
    # original passed a bare open() to SeqIO.parse and leaked the handle.
    with open(fasta_path) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            # record.id avoids the original's shadowing of the builtin `id`.
            sequences[record.id] = str(record.seq)
    return sequences
|
|
|
|
def analyze_clustering_result(input_fasta: str, tsv_path: str):
    """
    Join MMseqs2 cluster assignments with the sequences they refer to.

    Args:
        input_fasta (str): path to the input fasta file that was clustered
        tsv_path (str): path to the MMseqs2 cluster TSV (two unlabeled
            columns: representative seq id, member seq id)

    Returns:
        pd.DataFrame: columns ['representative seq_id', 'member seq_id',
            'representative seq', 'member seq'], sorted by the two id
            columns with a fresh RangeIndex.

    Raises:
        KeyError: if the TSV references an id absent from the fasta.
    """
    # id -> sequence lookup built from the clustered fasta
    input_d = process_fasta(input_fasta)

    # MMseqs2 writes the TSV without a header row.
    clusters = pd.read_csv(tsv_path, sep="\t", header=None)
    clusters = clusters.rename(columns={0: "representative seq_id", 1: "member seq_id"})

    # Attach actual sequences. Indexing the dict (rather than .map) keeps
    # the original fail-fast KeyError on ids missing from the fasta.
    clusters["representative seq"] = clusters["representative seq_id"].apply(
        lambda seq_id: input_d[seq_id]
    )
    clusters["member seq"] = clusters["member seq_id"].apply(
        lambda seq_id: input_d[seq_id]
    )

    # Deterministic ordering for downstream comparison / diffing.
    clusters = clusters.sort_values(
        by=["representative seq_id", "member seq_id"], ascending=True
    ).reset_index(drop=True)

    return clusters
|
|
|
|
def make_fasta(sequences: dict, fasta_path: str):
    """
    Write sequences to a FASTA file.

    Each dictionary entry becomes one record: the key is the header line
    (prefixed with '>') and the value is the sequence body.

    Args:
        sequences (dict): A dictionary where the key is the header and the value is the sequence.
        fasta_path (str): Destination path for the FASTA file.

    Returns:
        str: The path to the fasta file.
    """
    records = [f">{header}\n{sequence}\n" for header, sequence in sequences.items()]
    with open(fasta_path, "w") as out_file:
        out_file.write("".join(records))
    return records and fasta_path or fasta_path
|
|
|
|
def run_mmseqs_clustering(
    input_fasta,
    output_dir,
    min_seq_id=0.3,
    c=0.8,
    cov_mode=0,
    cluster_mode=0,
    path_to_mmseqs="fuson_plm/mmseqs",
    dbtype=1,
):
    """
    Runs MMSeqs2 clustering using easycluster module

    Args:
        input_fasta (str): path to input fasta file, formatted >header\nsequence\n>header\nsequence....
        output_dir (str): path to output dir for clustering results (also
            passed to easy-cluster as its tmp directory)
        min_seq_id (float): number [0,1] representing --min-seq-id in cluster command
        c (float): number [0,1] representing -c in cluster command
        cov_mode (int): number 0, 1, 2, or 3 representing --cov-mode in cluster command
        cluster_mode (int): number 0, 1, or 2 representing --cluster-mode in cluster command
        path_to_mmseqs (str): path to the MMseqs2 installation root
            (binaries expected under <path_to_mmseqs>/bin)
        dbtype (int): --dbtype passed to easy-cluster (1 = amino acid)

    Raises:
        subprocess.CalledProcessError: if the mmseqs command exits non-zero.
    """
    logger.info("\nRunning MMSeqs clustering...")
    # Binaries live under <install root>/bin. The original derived this via
    # path_to_mmseqs.index("/mmseqs"), which truncates at the FIRST
    # occurrence of "/mmseqs" and breaks for paths like
    # "/mmseqs/tools/mmseqs"; joining "bin" directly is equivalent for the
    # default value and robust in general.
    mmseqs_dir = os.path.join(path_to_mmseqs, "bin")
    logger.info(f"Running mmseqs clustering from {mmseqs_dir}")

    # Make sure the bare "mmseqs" command below resolves.
    ensure_mmseqs_in_path(mmseqs_dir)

    mmseqs_bin = "mmseqs"

    os.makedirs(output_dir, exist_ok=True)

    # easy-cluster positional args: <input fasta> <output prefix> <tmp dir>
    cmd_easy_cluster = [
        mmseqs_bin,
        "easy-cluster",
        input_fasta,
        os.path.join(output_dir, "mmseqs"),
        output_dir,
        "--min-seq-id",
        str(min_seq_id),
        "-c",
        str(c),
        "--cov-mode",
        str(cov_mode),
        "--cluster-mode",
        str(cluster_mode),
        "--dbtype",
        str(dbtype),
    ]

    logger.info("\n\tCommand entered to MMSeqs2:")
    logger.info("\t" + " ".join(cmd_easy_cluster) + "\n")

    # List form (shell=False) avoids shell-injection issues; check=True
    # surfaces clustering failures immediately.
    subprocess.run(cmd_easy_cluster, check=True)

    logger.info(f"Clustering completed. Results are in {output_dir}")
|
|
|
|
def cluster_summary(clusters: pd.DataFrame):
    """
    Logs a summary of the clustering result: total cluster count, singleton
    vs. multi-member breakdown (with sequence percentages), the largest
    cluster, and a full cluster-size histogram.

    Expects `clusters` to have 'representative seq_id' and 'member seq_id'
    columns (one row per cluster member).
    """
    # One row per cluster: representative id + number of members.
    sizes = (
        clusters.groupby("representative seq_id")["member seq_id"]
        .count()
        .reset_index()
        .rename(columns={"member seq_id": "member count"})
    )
    # Sanity check: exactly one size row per distinct representative.
    assert len(sizes) == len(clusters["representative seq_id"].unique())

    total_seqs = sum(sizes["member count"])
    logger.info(f"Created {len(sizes)} clusters of {total_seqs} sequences")

    singletons = sizes[sizes["member count"] == 1]
    logger.info(f"\t{len(singletons)} clusters of size 1")
    n_singleton_seqs = sum(singletons["member count"])
    logger.info(
        f"\t\tsequences: {n_singleton_seqs} ({round(100*n_singleton_seqs/total_seqs, 2)}%)"
    )

    multi = sizes[sizes["member count"] > 1]
    logger.info(f"\t{len(multi)} clusters of size > 1")
    n_multi_seqs = sum(multi["member count"])
    logger.info(
        f"\t\tsequences: {n_multi_seqs} ({round(100*n_multi_seqs/total_seqs, 2)}%)"
    )
    logger.info(f"\tlargest cluster: {max(sizes['member count'])}")

    logger.info("\nCluster size breakdown below...")

    # value_counts().reset_index() yields columns 'member count' / 'count'
    # (pandas >= 2.0 naming), renamed to human-readable labels.
    histogram = (
        sizes["member count"]
        .value_counts()
        .reset_index()
        .rename(
            columns={"member count": "cluster size (n_members)", "count": "n_clusters"}
        )
    )
    logger.info(
        histogram.sort_values(by="cluster size (n_members)", ascending=True).to_string(
            index=False
        )
    )
|
|