File size: 3,555 Bytes
5c34ec8
 
 
 
 
 
 
a887ffc
5c34ec8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a887ffc
5c34ec8
 
 
 
a887ffc
5c34ec8
a887ffc
 
 
5c34ec8
 
 
 
a887ffc
 
 
 
 
5c34ec8
 
 
 
a887ffc
 
 
 
 
5c34ec8
 
 
 
a887ffc
 
5c34ec8
a887ffc
 
 
5c34ec8
a887ffc
 
 
5c34ec8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import pandas as pd
import random
import matplotlib.pyplot as plt
import glob
import re
from pathlib import Path


def trim_sequence(seq: str, seq_flanked: str, total_len: int):
    """
    Cut a window of exactly ``total_len`` characters out of ``seq_flanked``
    that fully contains ``seq``, with ``seq`` placed at a random valid offset.

    Only the first occurrence of ``seq`` in ``seq_flanked`` is considered.

    Args:
        seq: Motif that must be contained in the returned window.
        seq_flanked: The motif together with its surrounding flank.
        total_len: Length of the window to return.

    Returns:
        A tuple ``(window, upstream, downstream)`` where ``window`` is the
        substring of ``seq_flanked`` of length ``total_len``, ``upstream`` is
        the number of characters kept before the motif (i.e. the motif's start
        index inside ``window``) and ``downstream`` the number kept after it.

    Raises:
        ValueError: If ``seq`` does not occur in ``seq_flanked``, if
            ``total_len`` is shorter than ``seq``, or if the available flank
            is too short to reach ``total_len``.
    """
    i = seq_flanked.find(seq)
    if i < 0:
        raise ValueError(f"Motif '{seq}' not found in flanked sequence.")

    motif_len = len(seq)
    extra = total_len - motif_len
    # Without this guard a negative `extra` only fails later inside
    # random.randint with an opaque "empty range" message.
    if extra < 0:
        raise ValueError(
            f"Desired length {total_len} is shorter than the motif "
            f"({motif_len})."
        )

    left_avail = i
    right_avail = len(seq_flanked) - (i + motif_len)
    if extra > left_avail + right_avail:
        raise ValueError("Not enough flank to reach desired length.")

    # The upstream share must be large enough that the downstream flank can
    # supply the remainder, and no larger than what exists upstream.
    min_left = max(0, extra - right_avail)
    max_left = min(extra, left_avail)
    upstream = random.randint(min_left, max_left)
    downstream = extra - upstream

    start = i - upstream
    end = i + motif_len + downstream
    return seq_flanked[start:end], upstream, downstream


def _save_histogram(values, title: str, xlabel: str, out_path: Path):
    """Draw a 50-bin histogram of ``values`` and save it to ``out_path``."""
    plt.figure(figsize=(6, 4))
    plt.hist(values, bins=50, edgecolor="k")
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Count")
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()


def process_and_plot(input_csv: "str | Path", total_len: int, output_csv: Path, fig_dir: Path):
    """
    Trim every row of ``input_csv`` to ``total_len`` and plot motif positions.

    The input CSV must provide the columns ``seq`` and ``seq_flanked``. Each
    row is trimmed with ``trim_sequence``; the result is written to
    ``output_csv`` as a copy of the input with three added columns:

    * ``seq_trimmed``     - the trimmed window,
    * ``motif_abs_start`` - index of the motif inside the window,
    * ``motif_rel_pos``   - that index divided by the amount of flank kept
      (``total_len - len(seq)``); 0.0 when no flank is kept at all.

    Two histograms are saved into ``fig_dir`` as ``<stem>_abs.png`` and
    ``<stem>_rel.png``, where ``<stem>`` is the input file name without its
    extension. ``fig_dir`` must already exist.

    Args:
        input_csv: Path to the input CSV (``str`` or ``Path``).
        total_len: Desired length of each trimmed sequence.
        output_csv: Destination of the augmented CSV.
        fig_dir: Directory that receives the two PNG figures.

    Raises:
        ValueError: Propagated from ``trim_sequence`` when a row cannot be
            trimmed to ``total_len``.
    """
    # Accept plain strings as well as Path objects: `.stem` is needed below
    # and does not exist on str.
    input_path = Path(input_csv)
    fig_dir = Path(fig_dir)

    df = pd.read_csv(input_path)

    trimmed_seqs, abs_pos, rel_pos = [], [], []
    for seq, seq_flanked in zip(df["seq"], df["seq_flanked"]):
        trimmed, upstream, _ = trim_sequence(seq, seq_flanked, total_len)
        flank_len = total_len - len(seq)
        trimmed_seqs.append(trimmed)
        abs_pos.append(upstream)
        # A motif that already fills the whole window has no flank to be
        # positioned in; report 0.0 instead of dividing by zero.
        rel_pos.append(upstream / flank_len if flank_len else 0.0)

    df_out = df.copy()
    df_out["seq_trimmed"] = trimmed_seqs
    df_out["motif_abs_start"] = abs_pos
    df_out["motif_rel_pos"] = rel_pos
    df_out.to_csv(output_csv, index=False)

    basename = input_path.stem
    # Absolute position histogram
    _save_histogram(
        df_out["motif_abs_start"],
        f"{basename}: Absolute Motif Start",
        "Start Index (nt)",
        fig_dir / f"{basename}_abs.png",
    )
    # Relative position histogram
    _save_histogram(
        df_out["motif_rel_pos"],
        f"{basename}: Relative Motif Position",
        "Relative Position",
        fig_dir / f"{basename}_rel.png",
    )


if __name__ == "__main__":
    # === USER SETTINGS ===
    # Glob for the per-chromosome input CSVs.
    PATTERN = "/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/tfclust/hg38/encRegTfbsClustered_hg38_chr*.csv"
    # The glob also matches names with extra text after "chr"; this filter
    # keeps only file names ending in exactly chr1-22, chrX or chrY.
    CHR_FILTER = re.compile(
        r"encRegTfbsClustered_hg38_chr([1-9]|1[0-9]|2[0-2]|X|Y)\.csv$"
    )
    DESIRED_LEN = 1000
    OUTPUT_DIR = Path("trimmed_csvs")
    FIG_DIR = Path("figures")
    # =====================

    OUTPUT_DIR.mkdir(exist_ok=True)
    FIG_DIR.mkdir(exist_ok=True)
    # Clear old figures
    # NOTE: this removes every regular file in FIG_DIR, not only PNGs written
    # by a previous run of this script.
    for f in FIG_DIR.iterdir():
        if f.is_file():
            f.unlink()

    # Gather files and filter to pure chr1-22, X, Y
    all_files = glob.glob(PATTERN)
    files = [Path(f) for f in all_files if CHR_FILTER.match(Path(f).name)]
    if not files:
        print(f"No matching chr1-22, X, Y files found (pattern={PATTERN}).")
        # The bare `exit` helper is injected by the `site` module for
        # interactive use and is not guaranteed to exist; raise SystemExit
        # directly to terminate with status 1.
        raise SystemExit(1)

    for infile in sorted(files):
        out_csv = OUTPUT_DIR / f"{infile.stem}_trimmed.csv"
        # Best-effort batch: a failure on one chromosome file is reported and
        # the remaining files are still processed.
        try:
            process_and_plot(infile, DESIRED_LEN, out_csv, FIG_DIR)
            print(f"Processed {infile.name} -> {out_csv.name}; figures in {FIG_DIR}/")
        except Exception as e:
            print(f"Error processing {infile.name}: {e}")