File size: 6,061 Bytes
80b6a2c 29899b4 80b6a2c 29899b4 80b6a2c 29899b4 80b6a2c 29899b4 80b6a2c 29899b4 80b6a2c 29899b4 80b6a2c 29899b4 80b6a2c 29899b4 80b6a2c 29899b4 80b6a2c 29899b4 80b6a2c 29899b4 80b6a2c 29899b4 80b6a2c 29899b4 80b6a2c 29899b4 80b6a2c 29899b4 80b6a2c 29899b4 80b6a2c 29899b4 80b6a2c 29899b4 80b6a2c 29899b4 80b6a2c 29899b4 80b6a2c 29899b4 80b6a2c 29899b4 80b6a2c 29899b4 80b6a2c 29899b4 80b6a2c 29899b4 80b6a2c 29899b4 80b6a2c 29899b4 80b6a2c 29899b4 80b6a2c 29899b4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 | import pandas as pd
import os
import subprocess
import sys
from Bio import SeqIO
import shutil
import rootutils
import logging
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
def ensure_mmseqs_in_path(mmseqs_dir):
    """
    Checks if MMseqs2 is in the PATH. If it's not, add it.

    When ``shutil.which("mmseqs")`` finds no executable, ``mmseqs_dir`` is
    prepended to the PATH environment variable of the current process (and is
    therefore inherited by subprocesses started afterwards). When an ``mmseqs``
    executable is already discoverable, nothing is changed.

    Args:
        mmseqs_dir (str): Directory containing MMseqs2 binaries
    """
    # Check if mmseqs is already in PATH
    if shutil.which("mmseqs") is not None:
        return

    # Prepend using the platform's separator and tolerate an unset/empty PATH,
    # so this neither raises KeyError nor leaves a dangling separator behind.
    current_path = os.environ.get("PATH", "")
    if current_path:
        os.environ["PATH"] = f"{mmseqs_dir}{os.pathsep}{current_path}"
    else:
        os.environ["PATH"] = str(mmseqs_dir)
    logger.info(f"\tAdded {mmseqs_dir} to PATH")
def process_fasta(fasta_path):
    """
    Reads a fasta file into a dictionary of record ID -> sequence.

    Args:
        fasta_path (str): path to the fasta file to read

    Returns:
        dict: maps each record's ID to its sequence as a plain string. If the
            same ID occurs more than once, the last record with that ID wins.
    """
    # Use a context manager so the file handle is closed even if parsing fails
    with open(fasta_path) as handle:
        return {
            record.id: str(record.seq) for record in SeqIO.parse(handle, "fasta")
        }
def analyze_clustering_result(input_fasta: str, tsv_path: str):
    """
    Joins a two-column cluster TSV with the sequences from the input fasta.

    Args:
        input_fasta (str): path to input fasta file
        tsv_path (str): path to a headerless, tab-separated file whose first
            column is the representative sequence ID and whose second column
            is a member sequence ID

    Returns:
        pd.DataFrame: columns "representative seq_id", "member seq_id",
            "representative seq" and "member seq", sorted by the two ID
            columns with a fresh 0..n-1 index.

    Raises:
        KeyError: if an ID in the TSV is not a record ID in the input fasta.
    """
    # Process input fasta
    input_d = process_fasta(input_fasta)

    # Process clusters.tsv. IDs are read as plain strings so numeric-looking
    # headers (e.g. "12345") and NA-like headers (e.g. "NA") still match the
    # string keys of input_d instead of being parsed into numbers or NaN.
    clusters = pd.read_csv(
        tsv_path, sep="\t", header=None, dtype=str, keep_default_na=False
    )
    clusters = clusters.rename(columns={0: "representative seq_id", 1: "member seq_id"})

    # Attach the actual sequences; a missing ID raises KeyError on purpose
    for role in ("representative", "member"):
        clusters[f"{role} seq"] = clusters[f"{role} seq_id"].apply(
            lambda seq_id: input_d[seq_id]
        )

    # Sort them so that splitting results are reproducible
    clusters = clusters.sort_values(
        by=["representative seq_id", "member seq_id"], ascending=True
    ).reset_index(drop=True)
    return clusters
def make_fasta(sequences: dict, fasta_path: str):
    """
    Writes the given sequences to a fasta file, one two-line record per entry.

    Args:
        sequences (dict): A dictionary where the key is the header and the value is the sequence.
        fasta_path (str): Where to write the fasta file (overwritten if it exists).

    Returns:
        str: The path to the fasta file (``fasta_path``, unchanged).
    """
    with open(fasta_path, "w") as out_file:
        # Each record is ">header" on one line followed by the sequence on the next
        out_file.writelines(
            f">{name}\n{seq}\n" for name, seq in sequences.items()
        )
    return fasta_path
def run_mmseqs_clustering(
    input_fasta,
    output_dir,
    min_seq_id=0.3,
    c=0.8,
    cov_mode=0,
    cluster_mode=0,
    path_to_mmseqs="fuson_plm/mmseqs",
    dbtype=1,
):
    """
    Runs MMSeqs2 clustering using the easy-cluster module

    Args:
        input_fasta (str): path to input fasta file, formatted as alternating
            ">header" and sequence lines
        output_dir (str): path to output dir for clustering results; created if
            it does not exist. Results are written with the prefix
            "<output_dir>/mmseqs", and output_dir is also passed as the third
            positional argument of easy-cluster (presumably its tmp directory;
            confirm against the installed MMseqs2 usage).
        min_seq_id (float): number [0,1] representing --min-seq-id in cluster command
        c (float): number [0,1] representing -c in cluster command
        cov_mode (int): number 0, 1, 2, or 3 representing --cov-mode in cluster command
        cluster_mode (int): number 0, 1, or 2 representing --cluster-mode in cluster command
        path_to_mmseqs (str): path containing "/mmseqs". Everything before the
            first "/mmseqs" is taken as the prefix, and "<prefix>/mmseqs/bin"
            is added to PATH when no mmseqs executable is found on it.
        dbtype (int): value passed as --dbtype in cluster command

    Raises:
        ValueError: if path_to_mmseqs does not contain "/mmseqs".
        subprocess.CalledProcessError: if the mmseqs command exits with a
            non-zero status.
    """
    logger.info("\nRunning MMSeqs clustering...")

    # Get mmseqs dir: the part before the first "/mmseqs", plus "mmseqs/bin"
    prefix, marker, _ = path_to_mmseqs.partition("/mmseqs")
    if not marker:
        # Same exception type str.index() raised, but with an actionable message
        raise ValueError(
            'path_to_mmseqs must contain "/mmseqs" (e.g. "fuson_plm/mmseqs"), '
            f"got {path_to_mmseqs!r}"
        )
    mmseqs_dir = os.path.join(prefix, "mmseqs/bin")
    logger.info(f"Running mmseqs clustering from {mmseqs_dir}")

    # Ensure MMseqs2 is in the PATH so the bare "mmseqs" command below resolves
    ensure_mmseqs_in_path(mmseqs_dir)

    # Create the output directory
    os.makedirs(output_dir, exist_ok=True)

    # Run MMseqs2 easy-cluster; passing a list (no shell) avoids quoting issues
    cmd_easy_cluster = [
        "mmseqs",
        "easy-cluster",
        input_fasta,
        os.path.join(output_dir, "mmseqs"),
        output_dir,
        "--min-seq-id",
        str(min_seq_id),
        "-c",
        str(c),
        "--cov-mode",
        str(cov_mode),
        "--cluster-mode",
        str(cluster_mode),
        "--dbtype",
        str(dbtype),
    ]

    # Record the exact command in the log
    logger.info("\n\tCommand entered to MMSeqs2:")
    logger.info("\t" + " ".join(cmd_easy_cluster) + "\n")

    subprocess.run(cmd_easy_cluster, check=True)
    logger.info(f"Clustering completed. Results are in {output_dir}")
def cluster_summary(clusters: pd.DataFrame):
    """
    Summarizes how many clusters were formed, how big they are, etc ...

    Everything is reported through the module logger; nothing is returned.

    Args:
        clusters (pd.DataFrame): one row per (representative, member) pair,
            with at least the columns "representative seq_id" and
            "member seq_id".
    """
    grouped_clusters = (
        clusters.groupby("representative seq_id")["member seq_id"]
        .count()
        .reset_index()
        .rename(columns={"member seq_id": "member count"})
    )
    assert len(grouped_clusters) == len(
        clusters["representative seq_id"].unique()
    )  # make sure number of cluster reps = # grouped clusters

    sizes = grouped_clusters["member count"]
    total_seqs = sizes.sum()
    logger.info(f"Created {len(grouped_clusters)} clusters of {total_seqs} sequences")
    if grouped_clusters.empty:
        # No clusters: the percentages and the maximum below are undefined
        return

    is_singleton = sizes == 1

    # Singleton clusters
    logger.info(f"\t{int(is_singleton.sum())} clusters of size 1")
    csize1_seqs = sizes[is_singleton].sum()
    logger.info(
        f"\t\tsequences: {csize1_seqs} ({round(100*csize1_seqs/total_seqs, 2)}%)"
    )

    # Clusters with more than one member
    is_multi = sizes > 1
    logger.info(f"\t{int(is_multi.sum())} clusters of size > 1")
    csizeg1_seqs = sizes[is_multi].sum()
    logger.info(
        f"\t\tsequences: {csizeg1_seqs} ({round(100*csizeg1_seqs/total_seqs, 2)}%)"
    )
    logger.info(f"\tlargest cluster: {sizes.max()}")

    logger.info("\nCluster size breakdown below...")
    # Name the index and the value column explicitly: the default column names
    # produced by value_counts().reset_index() differ between pandas 1.x and 2.x.
    value_counts = (
        sizes.value_counts()
        .rename_axis("cluster size (n_members)")
        .reset_index(name="n_clusters")
    )
    logger.info(
        value_counts.sort_values(
            by="cluster size (n_members)", ascending=True
        ).to_string(index=False)
    )
|