File size: 3,555 Bytes

5c34ec8
 
 
 
 
 
 
a887ffc
5c34ec8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a887ffc
5c34ec8
 
 
 
a887ffc
5c34ec8
a887ffc
 
 
5c34ec8
 
 
 
a887ffc
 
 
 
 
5c34ec8
 
 
 
a887ffc
 
 
 
 
5c34ec8
 
 
 
a887ffc
 
5c34ec8
a887ffc
 
 
5c34ec8
a887ffc
 
 
5c34ec8

import pandas as pd
import random
import matplotlib.pyplot as plt
import glob
import re
from pathlib import Path


def trim_sequence(seq: str, seq_flanked: str, total_len: int):
    """
    Cut a window of exactly ``total_len`` characters out of ``seq_flanked``
    that fully contains ``seq``, with the motif at a random valid offset.

    Only the first occurrence of ``seq`` in ``seq_flanked`` is considered.

    Args:
        seq: Motif that must be contained in the returned window.
        seq_flanked: Longer sequence that contains ``seq`` plus its flanks.
        total_len: Length of the window to return.

    Returns:
        A tuple ``(window, upstream, downstream)`` where ``window`` is the
        trimmed substring, ``upstream`` is the number of flank characters
        kept before the motif (i.e. the motif's start index inside
        ``window``) and ``downstream`` is the number kept after it.

    Raises:
        ValueError: If ``seq`` does not occur in ``seq_flanked``, if
            ``total_len`` is shorter than the motif itself, or if the
            available flanks are too short to reach ``total_len``.
    """
    i = seq_flanked.find(seq)
    if i < 0:
        raise ValueError(f"Motif '{seq}' not found in flanked sequence.")
    motif_len = len(seq)
    extra = total_len - motif_len
    if extra < 0:
        # Without this guard the randint() call below fails with an opaque
        # "empty range" error; report the real cause instead.
        raise ValueError(
            f"Desired length {total_len} is shorter than the motif "
            f"({motif_len})."
        )
    left_avail = i
    right_avail = len(seq_flanked) - (i + motif_len)
    if extra > left_avail + right_avail:
        raise ValueError("Not enough flank to reach desired length.")
    # Decide how many upstream bases to keep. The lower bound makes sure the
    # downstream flank can supply the remainder; the upper bound makes sure
    # we never ask for more upstream flank than exists.
    min_left = max(0, extra - right_avail)
    max_left = min(extra, left_avail)
    upstream = random.randint(min_left, max_left)
    downstream = extra - upstream
    start = i - upstream
    end = i + motif_len + downstream
    return seq_flanked[start:end], upstream, downstream


def _save_histogram(values, title: str, xlabel: str, out_path: Path):
    """Draw a 50-bin histogram of ``values`` and save it to ``out_path``."""
    plt.figure(figsize=(6, 4))
    plt.hist(values, bins=50, edgecolor="k")
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Count")
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()


def process_and_plot(input_csv: str, total_len: int, output_csv: Path, fig_dir: Path):
    """
    Trim every row of a CSV to a fixed-length window and plot where the
    motif landed inside that window.

    The input CSV must have ``seq`` and ``seq_flanked`` columns. Each row is
    passed through ``trim_sequence``; the result is written to ``output_csv``
    as the original table plus three columns:

    * ``seq_trimmed``     - the trimmed window,
    * ``motif_abs_start`` - motif start index inside the window,
    * ``motif_rel_pos``   - that index divided by the number of flank
      characters in the window (0.0 when the motif fills the whole window
      and no relative position is defined).

    Two histograms, ``<stem>_abs.png`` and ``<stem>_rel.png``, are saved
    into ``fig_dir``, where ``<stem>`` is the input file name without its
    extension.

    Args:
        input_csv: Path to the input CSV (``str`` or ``Path``).
        total_len: Window length passed to ``trim_sequence``.
        output_csv: Destination of the augmented CSV.
        fig_dir: Existing directory that receives the two PNG figures.

    Raises:
        ValueError: Propagated from ``trim_sequence`` when a row cannot be
            trimmed to ``total_len``.
    """
    # Accept plain strings as well as Path objects; ``.stem`` needs a Path.
    input_path = Path(input_csv)
    fig_dir = Path(fig_dir)

    df = pd.read_csv(input_path)
    trimmed_seqs, abs_pos, rel_pos = [], [], []
    # Iterating the two columns directly avoids building a Series per row.
    for seq, flanked in zip(df["seq"], df["seq_flanked"]):
        trimmed, upstream, _ = trim_sequence(seq, flanked, total_len)
        flank_total = total_len - len(seq)
        trimmed_seqs.append(trimmed)
        abs_pos.append(upstream)
        # With no flank at all the ratio is 0/0; record 0.0 instead of
        # crashing so one exact-length motif does not abort the whole file.
        rel_pos.append(upstream / flank_total if flank_total else 0.0)

    df_out = df.copy()
    df_out["seq_trimmed"] = trimmed_seqs
    df_out["motif_abs_start"] = abs_pos
    df_out["motif_rel_pos"] = rel_pos
    df_out.to_csv(output_csv, index=False)

    basename = input_path.stem
    # Absolute position histogram
    _save_histogram(
        df_out["motif_abs_start"],
        f"{basename}: Absolute Motif Start",
        "Start Index (nt)",
        fig_dir / f"{basename}_abs.png",
    )
    # Relative position histogram
    _save_histogram(
        df_out["motif_rel_pos"],
        f"{basename}: Relative Motif Position",
        "Relative Position",
        fig_dir / f"{basename}_rel.png",
    )


if __name__ == "__main__":
    # Batch driver: trims every per-chromosome CSV matched by PATTERN and
    # writes the trimmed CSVs and position histograms to the folders below.

    # === USER SETTINGS ===
    PATTERN = "/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/tfclust/hg38/encRegTfbsClustered_hg38_chr*.csv"
    CHR_FILTER = re.compile(
        r"encRegTfbsClustered_hg38_chr([1-9]|1[0-9]|2[0-2]|X|Y)\.csv$"
    )
    DESIRED_LEN = 1000
    OUTPUT_DIR = Path("trimmed_csvs")
    FIG_DIR = Path("figures")
    # =====================

    # Gather files and filter to pure chr1-22, X, Y. This is done before
    # anything is deleted so that a wrong PATTERN cannot wipe the figures of
    # a previous successful run.
    all_files = glob.glob(PATTERN)
    files = sorted(Path(f) for f in all_files if CHR_FILTER.match(Path(f).name))
    if not files:
        print(f"No matching chr1-22, X, Y files found (pattern={PATTERN}).")
        # The bare exit() helper is injected by the site module for
        # interactive use and may be missing; SystemExit always works.
        raise SystemExit(1)

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    FIG_DIR.mkdir(parents=True, exist_ok=True)
    # Clear old figures
    for f in FIG_DIR.iterdir():
        if f.is_file():
            f.unlink()

    for infile in files:
        out_csv = OUTPUT_DIR / f"{infile.stem}_trimmed.csv"
        try:
            process_and_plot(infile, DESIRED_LEN, out_csv, FIG_DIR)
        except Exception as e:
            # Best effort: report the failure and continue with the
            # remaining chromosomes.
            print(f"Error processing {infile.name}: {e}")
        else:
            print(f"Processed {infile.name} -> {out_csv.name}; figures in {FIG_DIR}/")