import pandas as pd
import random
import matplotlib.pyplot as plt
import glob
import re
from pathlib import Path
def trim_sequence(seq: str, seq_flanked: str, total_len: int):
    """
    Cut a window of length ``total_len`` out of ``seq_flanked`` that fully
    contains ``seq``, placing ``seq`` at a random valid offset in the window.

    The first occurrence of ``seq`` in ``seq_flanked`` (as located by
    ``str.find``) is used as the anchor.

    Args:
        seq: The motif that must be contained in the returned window.
        seq_flanked: The motif together with its surrounding flank.
        total_len: Desired length of the returned window.

    Returns:
        A tuple ``(window, upstream, downstream)`` where ``window`` is the
        selected substring of ``seq_flanked`` and ``upstream`` /
        ``downstream`` are the number of flank characters kept before and
        after the motif (``upstream + len(seq) + downstream == total_len``).

    Raises:
        ValueError: If ``seq`` does not occur in ``seq_flanked``, if ``seq``
            is longer than ``total_len``, or if the available flank is too
            short to reach ``total_len``.
    """
    i = seq_flanked.find(seq)
    if i < 0:
        raise ValueError(f"Motif '{seq}' not found in flanked sequence.")

    motif_len = len(seq)
    extra = total_len - motif_len
    # Without this guard a negative `extra` only fails later, inside
    # random.randint, with an unrelated "empty range" message.
    if extra < 0:
        raise ValueError(
            f"Motif length ({motif_len}) exceeds desired length ({total_len})."
        )

    left_avail = i
    right_avail = len(seq_flanked) - (i + motif_len)
    if extra > left_avail + right_avail:
        raise ValueError("Not enough flank to reach desired length.")

    # Decide how many upstream bases to keep. The lower bound forces enough
    # upstream to compensate for a short right flank; the upper bound is
    # limited by both the budget and what exists on the left.
    min_left = max(0, extra - right_avail)
    max_left = min(extra, left_avail)
    upstream = random.randint(min_left, max_left)
    downstream = extra - upstream

    start = i - upstream
    end = i + motif_len + downstream
    return seq_flanked[start:end], upstream, downstream
def _save_histogram(values, title: str, xlabel: str, out_path: Path) -> None:
    """Draw a 50-bin histogram of ``values`` and save it to ``out_path``."""
    plt.figure(figsize=(6, 4))
    plt.hist(values, bins=50, edgecolor="k")
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Count")
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()


def process_and_plot(input_csv: "str | Path", total_len: int, output_csv: Path, fig_dir: Path):
    """
    Trim every row of a CSV to ``total_len`` and plot where the motif landed.

    Reads ``input_csv`` (which must provide ``seq`` and ``seq_flanked``
    columns), calls ``trim_sequence`` on each row, and writes a copy of the
    table to ``output_csv`` with three added columns:

    * ``seq_trimmed``     - the trimmed window,
    * ``motif_abs_start`` - index of the motif inside the window,
    * ``motif_rel_pos``   - that index divided by the number of flank
      positions in the window (``total_len - len(seq)``).

    Two histograms (absolute and relative position) are saved into
    ``fig_dir`` as ``<input stem>_abs.png`` and ``<input stem>_rel.png``.

    Args:
        input_csv: Path of the CSV to read; a ``str`` or ``Path``.
        total_len: Desired length of each trimmed sequence.
        output_csv: Destination of the augmented CSV.
        fig_dir: Existing directory that receives the figures.

    Raises:
        ValueError: Propagated from ``trim_sequence`` when a row cannot be
            trimmed to ``total_len``.
    """
    df = pd.read_csv(input_csv)

    trimmed_seqs, abs_pos, rel_pos = [], [], []
    for seq, seq_flanked in zip(df["seq"], df["seq_flanked"]):
        trimmed, upstream, _ = trim_sequence(seq, seq_flanked, total_len)
        trimmed_seqs.append(trimmed)
        abs_pos.append(upstream)
        # When the motif fills the whole window there is no flank to be
        # relative to; record 0.0 instead of dividing by zero.
        flank_total = total_len - len(seq)
        rel_pos.append(upstream / flank_total if flank_total else 0.0)

    df_out = df.copy()
    df_out["seq_trimmed"] = trimmed_seqs
    df_out["motif_abs_start"] = abs_pos
    df_out["motif_rel_pos"] = rel_pos
    df_out.to_csv(output_csv, index=False)

    # Accept plain strings as well as Path objects for naming the figures.
    basename = Path(input_csv).stem
    fig_dir = Path(fig_dir)

    # Absolute position histogram
    _save_histogram(
        df_out["motif_abs_start"],
        f"{basename}: Absolute Motif Start",
        "Start Index (nt)",
        fig_dir / f"{basename}_abs.png",
    )

    # Relative position histogram
    _save_histogram(
        df_out["motif_rel_pos"],
        f"{basename}: Relative Motif Position",
        "Relative Position",
        fig_dir / f"{basename}_rel.png",
    )
if __name__ == "__main__":
    # Batch driver: trims every per-chromosome CSV matching PATTERN and
    # writes the trimmed CSVs and position histograms to local directories.

    # === USER SETTINGS ===
    PATTERN = "/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/tfclust/hg38/encRegTfbsClustered_hg38_chr*.csv"
    CHR_FILTER = re.compile(
        r"encRegTfbsClustered_hg38_chr([1-9]|1[0-9]|2[0-2]|X|Y)\.csv$"
    )
    DESIRED_LEN = 1000
    OUTPUT_DIR = Path("trimmed_csvs")
    FIG_DIR = Path("figures")
    # =====================

    OUTPUT_DIR.mkdir(exist_ok=True)
    FIG_DIR.mkdir(exist_ok=True)

    # Clear old figures (only regular files directly inside FIG_DIR).
    for f in FIG_DIR.iterdir():
        if f.is_file():
            f.unlink()

    # Gather files and filter to pure chr1-22, X, Y; the glob alone would
    # also accept any other name starting with "chr".
    all_files = glob.glob(PATTERN)
    files = [Path(f) for f in all_files if CHR_FILTER.match(Path(f).name)]
    if not files:
        print(f"No matching chr1-22, X, Y files found (pattern={PATTERN}).")
        # SystemExit instead of exit(): exit() is injected by the `site`
        # module for interactive use and is not guaranteed to exist.
        raise SystemExit(1)

    for infile in sorted(files):
        out_csv = OUTPUT_DIR / f"{infile.stem}_trimmed.csv"
        try:
            process_and_plot(infile, DESIRED_LEN, out_csv, FIG_DIR)
            print(f"Processed {infile.name} -> {out_csv.name}; figures in {FIG_DIR}/")
        except Exception as e:
            # Best effort: report the failing file and continue with the rest.
            print(f"Error processing {infile.name}: {e}")
|