svincoff committed
Commit ad760ef · 1 Parent(s): e7a2583

download structure

README.md CHANGED
@@ -1,3 +1,29 @@
  ---
  license: cc-by-nc-nd-4.0
  ---
+
+ # Directory Structure
+
+ ```
+ .
+ ├── README.md
+ ├── dpacman
+ │   ├── data
+ │   │   ├── README.md
+ │   │   ├── chip_atlas
+ │   │   │   ├── full_data_loading.py
+ │   │   │   └── smaller_data_loading.py
+ │   │   └── tfclust
+ │   │       └── download.py
+ │   └── data_files
+ │       ├── processed
+ │       │   └── tfclust
+ │       └── raw
+ │           ├── chip_atlas
+ │           │   └── experimentList.tab
+ │           └── tfclust
+ │               ├── encRegTfbsClusteredWithCells.hg19.bed
+ │               └── encRegTfbsClusteredWithCells.hg38.bed
+ ├── environment.yaml
+ └── setup.py
+ ```
dpacman/data/README.md ADDED
@@ -0,0 +1,18 @@
+ # Data download directory
+
+ ## UCSC
+
+ ### Raw data download
+ 1. `encRegTfbsClusteredWithCells.hg38.bed.gz`
+
+ ```
+ wget https://hgdownload.soe.ucsc.edu/goldenPath/hg38/encRegTfbsClustered/encRegTfbsClusteredWithCells.hg38.bed.gz
+ gunzip encRegTfbsClusteredWithCells.hg38.bed.gz
+ ```
+
+ 2. `encRegTfbsClusteredWithCells.hg19.bed.gz`
+
+ ```
+ wget https://hgdownload.soe.ucsc.edu/goldenPath/hg19/encRegTfbsClustered/encRegTfbsClusteredWithCells.hg19.bed.gz
+ gunzip encRegTfbsClusteredWithCells.hg19.bed.gz
+ ```
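For readers who prefer to script the fetch, a minimal Python equivalent of the two `wget`/`gunzip` pairs above might look like this (a sketch, assuming `requests` is installed; the URLs are exactly the ones listed in the README):

```python
import gzip
import shutil
from pathlib import Path

import requests

# Same UCSC files as in dpacman/data/README.md, fetched and decompressed in Python.
for build in ("hg38", "hg19"):
    name = f"encRegTfbsClusteredWithCells.{build}.bed.gz"
    url = f"https://hgdownload.soe.ucsc.edu/goldenPath/{build}/encRegTfbsClustered/{name}"
    gz_path = Path(name)
    gz_path.write_bytes(requests.get(url).content)
    # Decompress to the .bed file, mirroring `gunzip`
    with gzip.open(gz_path, "rb") as src, open(gz_path.with_suffix(""), "wb") as dst:
        shutil.copyfileobj(src, dst)
```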
dpacman/data/chip_atlas/full_data_loading.py CHANGED
@@ -2,71 +2,96 @@ import pandas as pd
  from pathlib import Path
  import subprocess

- #Read only cols 0–2, no header
+ # Read only cols 0–2, no header
  df = pd.read_csv(
      "experimentList.tab",
      sep="\t",
      header=None,
-     usecols=[0,1,2],
-     names=["exp_id","genome","assay_group"],
+     usecols=[0, 1, 2],
+     names=["exp_id", "genome", "assay_group"],
      engine="python",
      on_bad_lines="skip",
-     dtype=str
+     dtype=str,
  )

- #Keep only known genome assemblies
+ # Keep only known genome assemblies
  VALID_GENOMES = {
-     "hg19","hg38",
-     "mm9","mm10",
+     "hg19",
+     "hg38",
+     "mm9",
+     "mm10",
      "rn6",
-     "dm3","dm6",
-     "ce10","ce11",
-     "sacCer3"
+     "dm3",
+     "dm6",
+     "ce10",
+     "ce11",
+     "sacCer3",
  }
  df = df[df["genome"].isin(VALID_GENOMES)]
  print("Assemblies in filtered data:", df["genome"].unique())

- #Classify assay type
+
+ # Classify assay type
  def modality(track):
      t = track.lower()
-     if "atac" in t: return "ATAC"
-     if "dnase" in t: return "DNase"
-     if "bisulfite" in t or "methyl" in t: return "BS"
+     if "atac" in t:
+         return "ATAC"
+     if "dnase" in t:
+         return "DNase"
+     if "bisulfite" in t or "methyl" in t:
+         return "BS"
      return "ChIP"
+
+
  df["modality"] = df["assay_group"].apply(modality)

- #URL templates
+
+ # URL templates
  def make_urls(exp, genome, mod):
      urls = []
-     if mod in ("ChIP","ATAC","DNase"):
+     if mod in ("ChIP", "ATAC", "DNase"):
          urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bw/{exp}.bw")
-         for thr in ("05","10","20"):
-             urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bed{thr}/{exp}.{thr}.bed")
-             urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bb{thr}/{exp}.{thr}.bb")
+         for thr in ("05", "10", "20"):
+             urls.append(
+                 f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bed{thr}/{exp}.{thr}.bed"
+             )
+             urls.append(
+                 f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bb{thr}/{exp}.{thr}.bb"
+             )
      else:
-         urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bs/methyl/{exp}.methyl.bw")
-         urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bs/cover/{exp}.cover.bw")
-         for sub in ("hmr","pmd","hypermr"):
-             urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bs/{sub}/Bed/{exp}.{sub}.bed")
-             urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bs/{sub}/BigBed/{exp}.{sub}.bb")
+         urls.append(
+             f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bs/methyl/{exp}.methyl.bw"
+         )
+         urls.append(
+             f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bs/cover/{exp}.cover.bw"
+         )
+         for sub in ("hmr", "pmd", "hypermr"):
+             urls.append(
+                 f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bs/{sub}/Bed/{exp}.{sub}.bed"
+             )
+             urls.append(
+                 f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bs/{sub}/BigBed/{exp}.{sub}.bb"
+             )
      return urls

- #Write URL lists per genome
- urls_dir = Path("urls_by_genome"); urls_dir.mkdir(exist_ok=True)
+
+ # Write URL lists per genome
+ urls_dir = Path("urls_by_genome")
+ urls_dir.mkdir(exist_ok=True)
  for genome, group in df.groupby("genome"):
      all_urls = []
      for _, row in group.iterrows():
          all_urls += make_urls(row.exp_id, genome, row.modality)
      uniq = sorted(set(all_urls))
-     (urls_dir/f"urls_{genome}.txt").write_text("\n".join(uniq))
+     (urls_dir / f"urls_{genome}.txt").write_text("\n".join(uniq))
      print(f"{genome}: {len(uniq)} URLs")

- #Download into raw/{genome}/
+ # Download into raw/{genome}/
  for url_file in urls_dir.glob("urls_*.txt"):
-     genome = url_file.stem.split("_",1)[1]
-     dest = Path("raw")/genome
+     genome = url_file.stem.split("_", 1)[1]
+     dest = Path("raw") / genome
      dest.mkdir(parents=True, exist_ok=True)
      print(f"\nDownloading {genome} → {dest}/…")
-     subprocess.run(["wget","-nc","-i",str(url_file),"-P",str(dest)], check=True)
+     subprocess.run(["wget", "-nc", "-i", str(url_file), "-P", str(dest)], check=True)

- print("Done! Check raw/{genome}/ for your files.")
+ print("Done! Check raw/{genome}/ for your files.")
dpacman/data/chip_atlas/smaller_data_loading.py CHANGED
@@ -12,7 +12,18 @@ from tqdm import tqdm
  TARGET_REGIONS = 200_000

  # Assemblies to include
- ASSEMBLIES = ["hg19","hg38","mm9","mm10","rn6","dm3","dm6","ce10","ce11","sacCer3"]
+ ASSEMBLIES = [
+     "hg19",
+     "hg38",
+     "mm9",
+     "mm10",
+     "rn6",
+     "dm3",
+     "dm6",
+     "ce10",
+     "ce11",
+     "sacCer3",
+ ]

  # How many experiments to sample at most per protein (tune up/down)
  MAX_EXPS_PER_PROTEIN = 50
@@ -25,64 +36,67 @@ WORKDIR = Path("chip_atlas_fetch")
  WORKDIR.mkdir(exist_ok=True)
  LIST_DIR = WORKDIR / "lists"
  LIST_DIR.mkdir(exist_ok=True)
- DL_DIR = WORKDIR / "downloads"
+ DL_DIR = WORKDIR / "downloads"
  DL_DIR.mkdir(exist_ok=True)

  # ─── HELPERS ──────────────────────────────────────────────────────────────────

+
  def download_and_extract(url, extract_to: Path):
      """Fetch a ZIP and unzip it."""
      local = extract_to / Path(url).name
      if not local.exists():
          print(f"→ Downloading {url}")
-         resp = requests.get(url, stream=True); resp.raise_for_status()
+         resp = requests.get(url, stream=True)
+         resp.raise_for_status()
          with open(local, "wb") as f:
-             for chunk in resp.iter_content(1<<20):
+             for chunk in resp.iter_content(1 << 20):
                  f.write(chunk)
      with zipfile.ZipFile(local, "r") as z:
          z.extractall(extract_to)

+
  # ─── 1) GET MASTER LISTS ────────────────────────────────────────────────────

  print("1) Fetching master file & experiment lists…")
- FILELIST_URL = "https://dbarchive.biosciencedbc.jp/data/chip-atlas/LATEST/chip_atlas_file_list.zip"
- EXPERIMENTLIST_URL= "https://dbarchive.biosciencedbc.jp/data/chip-atlas/LATEST/chip_atlas_experiment_list.zip"
+ FILELIST_URL = (
+     "https://dbarchive.biosciencedbc.jp/data/chip-atlas/LATEST/chip_atlas_file_list.zip"
+ )
+ EXPERIMENTLIST_URL = "https://dbarchive.biosciencedbc.jp/data/chip-atlas/LATEST/chip_atlas_experiment_list.zip"

  download_and_extract(FILELIST_URL, LIST_DIR)
  download_and_extract(EXPERIMENTLIST_URL, LIST_DIR)

- filelist_txt = LIST_DIR / "chip_atlas_file_list.csv"
- experiment_txt = LIST_DIR / "chip_atlas_experiment_list.csv"
+ filelist_txt = LIST_DIR / "chip_atlas_file_list.csv"
+ experiment_txt = LIST_DIR / "chip_atlas_experiment_list.csv"

  # ─── 2) PARSE EXPERIMENT METADATA ────────────────────────────────────────────

  print("2) Parsing experiment → protein lookup…")
  exp_df = pd.read_csv(
      experiment_txt,
-     sep=None,  # let python engine guess (comma vs. tab)
-     engine="python",  # required when sep=None
-     encoding="latin1"  # to avoid UnicodeDecodeErrors
+     sep=None,  # let python engine guess (comma vs. tab)
+     engine="python",  # required when sep=None
+     encoding="latin1",  # to avoid UnicodeDecodeErrors
  )

  print("Columns in experiment list:", exp_df.columns.tolist())

- exp_df = (
-     exp_df
-     .loc[:, ['Experimental ID', 'Genome assembly', 'Antigen']]
-     .rename(columns={
-         'Experimental ID': 'exp_id',
-         'Genome assembly': 'genome',
-         'Antigen': 'assay_group'
-     })
+ exp_df = exp_df.loc[:, ["Experimental ID", "Genome assembly", "Antigen"]].rename(
+     columns={
+         "Experimental ID": "exp_id",
+         "Genome assembly": "genome",
+         "Antigen": "assay_group",
+     }
  )

- exp_df['protein'] = exp_df['assay_group'].str.replace(r'_ChIP.*', '', regex=True)
+ exp_df["protein"] = exp_df["assay_group"].str.replace(r"_ChIP.*", "", regex=True)

  # Finally, filter to only the assemblies you care about:
- exp_df = exp_df[exp_df['genome'].isin(ASSEMBLIES)]
+ exp_df = exp_df[exp_df["genome"].isin(ASSEMBLIES)]

  # build lookup
- exp_to_genome = exp_df.set_index("exp_id")["genome"].to_dict()
+ exp_to_genome = exp_df.set_index("exp_id")["genome"].to_dict()
  exp_to_protein = exp_df.set_index("exp_id")["protein"].to_dict()

  # ─── 3) BUILD URL LIST DIRECTLY ───────────────────────────────────────────────
@@ -93,11 +107,12 @@ urls_by_exp = {}
  for exp, genome in exp_to_genome.items():
      urls_by_exp[exp] = [
          f"{BASE}/data/{genome}/eachData/bw/{exp}.bw",
-         f"{BASE}/data/{genome}/eachData/bed10/{exp}.10.bed"
+         f"{BASE}/data/{genome}/eachData/bed10/{exp}.10.bed",
      ]

  # bucket experiments by protein
  from collections import defaultdict
+
  prot_exps = defaultdict(list)
  for exp, prot in exp_to_protein.items():
      if exp in urls_by_exp:
@@ -127,14 +142,19 @@ print(f" → Wrote {len(final_urls):,} URLs to {url_list_file}")
  # ─── 4) PARALLEL DOWNLOAD VIA aria2c ─────────────────────────────────────────

  print("4) Downloading with aria2c…")
- subprocess.run([
-     "aria2c",
-     f"-x{ARIA2C_CONN}",
-     "--dir", str(DL_DIR),
-     "--input-file", str(url_list_file),
-     "--auto-file-renaming=false",
-     "--allow-overwrite=true"
- ], check=True)
+ subprocess.run(
+     [
+         "aria2c",
+         f"-x{ARIA2C_CONN}",
+         "--dir",
+         str(DL_DIR),
+         "--input-file",
+         str(url_list_file),
+         "--auto-file-renaming=false",
+         "--allow-overwrite=true",
+     ],
+     check=True,
+ )

  print("✅ Finished downloading all selected files.")
  print(f"Your files are in: {DL_DIR.resolve()}")
dpacman/data/tfclust/download.py ADDED
@@ -0,0 +1,375 @@
+ import requests
+ from time import sleep
+ import json
+ import logging
+ import multiprocessing
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ import os
+ import pandas as pd
+
+
+ def get_all_tfs(genome: str = "hg38"):
+     """
+     Get all the transcription factors from the appropriate encRegTfbsClusteredWithCells.<genome>.bed file.
+     Available in data_files/raw/tfclust for genomes hg38 and hg19.
+     """
+     # Read raw file
+     raw_data = pd.read_csv(
+         f"../data_files/raw/tfclust/encRegTfbsClusteredWithCells.{genome}.bed",
+         sep="\t",
+         header=None,
+     )
+     raw_data.columns = ["chrom", "start", "end", "tf_name", "score", "cell_line"]
+
+     # Extract all unique TF names
+     all_tfs = raw_data["tf_name"].unique().tolist()
+     logging.info(f"Found {len(all_tfs)} transcription factors in genome {genome}.")
+
+     return all_tfs
+
+
+ def get_all_chroms(genome: str = "hg38"):
+     """
+     Fetch all chromosome names for a genome.
+     Note: some chromosomes are in unexpected formats (e.g. there is 'chr15', but also 'chr15_ML143371v1_fix')
+     """
+     url = f"https://api.genome.ucsc.edu/list/chromosomes?genome={genome}"
+     try:
+         r = requests.get(url)
+         r.raise_for_status()
+     except requests.RequestException as e:
+         raise ValueError(f"Failed to fetch all chromosomes for genome {genome}") from e
+
+     all_chroms = list(r.json()["chromosomes"])
+     logging.info(f"Found {len(all_chroms)} chromosomes in genome {genome}.")
+
+     return all_chroms
+
+
+ def fetch_tfbs_track(chrom: str, genome: str = "hg38"):
+     """
+     Fetch raw data from the track encRegTfbsClustered.
+     Returns json data for the specified chromosome, where key information appears as follows:
+     "encRegTfbsClustered": [
+         {
+             "bin": 585,
+             "chrom": "chr1",
+             "chromStart": 9917,
+             "chromEnd": 10247,
+             "name": "NUFIP1",
+             "score": 680,
+             "sourceCount": 1,
+             "sourceIds": "1063",
+             "sourceScores": "680"
+         }, ...
+     ]
+     """
+     params = {"genome": genome, "track": "encRegTfbsClustered", "chrom": chrom}
+     url = f"https://api.genome.ucsc.edu/getData/track?genome={params['genome']};track={params['track']};chrom={params['chrom']}"
+     try:
+         r = requests.get(url)
+         r.raise_for_status()
+     except requests.RequestException as e:
+         raise ValueError(
+             f"Failed to fetch encRegTfbsClustered for {chrom} in genome {genome}"
+         ) from e
+
+     # Extract the output and save it
+     json_out_dir = f"../data_files/raw/tfclust/encRegTfbsClustered_data/{genome}"
+     os.makedirs(json_out_dir, exist_ok=True)
+
+     # Save it
+     json_output = r.json()
+     with open(
+         f"{json_out_dir}/{params['genome']}_{params['track']}_{params['chrom']}.json",
+         "w",
+     ) as f:
+         json.dump(json_output, f, indent=4)
+
+     logging.info(
+         f"Saved to {json_out_dir}/{params['genome']}_{params['track']}_{params['chrom']}.json"
+     )
+     return json_output
+
+
+ def get_sequence(
+     chrom: str,
+     start: int,
+     end: int,
+     flank5: int = 0,
+     flank3: int = 0,
+     genome: str = "hg38",
+ ):
+     """
+     Given genome, start position, end position, chromosome, and desired flank sizes, extract the raw DNA sequence.
+     """
+     # Widen the interval by the requested 5' and 3' flanks
+     new_start = max(0, start - flank5)
+     new_end = end + flank3
+     region = f"{chrom}:{new_start}-{new_end}"
+     url = f"https://api.genome.ucsc.edu/getData/sequence?genome={genome};chrom={chrom};start={new_start};end={new_end}"
+     try:
+         r = requests.get(url)
+         r.raise_for_status()
+     except requests.RequestException as e:
+         raise ValueError(
+             f"Failed to fetch sequence for {region} in genome {genome}"
+         ) from e
+
+     results_dict = {
+         "chromStart": new_start,
+         "chromEnd": new_end,
+         "seq": r.json()["dna"],
+     }
+     return results_dict
+
+
+ def extract_tfbs_with_context(
+     genome: str = "hg38",
+     flank5: int = 500,
+     flank3: int = 500,
+     control_run: bool = True,  # if there's a flank, whether to also run without flank
+     out_dir: str = "../data_files/processed/tfclust",
+     allowed_tfs: list = None,  # e.g., ['CTCF', 'MAX']
+     chroms: list = None,
+ ):
+     """
+     Loop through raw downloads and extract TF binding sites (bs) with flanks.
+     Builds a DataFrame with all the available data for each TF. Columns =
+     ["bin", "chrom", "chromStart", "chromEnd", "name", "score", "sourceCount",
+     "sourceIds", "sourceScores", "seq", "seq_flanked", "chromStart_flanked",
+     "chromEnd_flanked", "flank5", "flank3"]
+     """
+     # Prepare to save output
+     os.makedirs(out_dir, exist_ok=True)
+
+     # Get chromosomes
+     if chroms is None:
+         logging.info(
+             "No chromosomes provided, fetching all chromosomes for the given genome..."
+         )
+         chroms = get_all_chroms(genome)
+     count = 0
+
+     # Initialize the final DF
+     results_cols = [
+         "bin",
+         "chrom",
+         "chromStart",
+         "chromEnd",
+         "name",
+         "score",
+         "sourceCount",
+         "sourceIds",
+         "sourceScores",
+         "seq",
+         "seq_flanked",
+         "chromStart_flanked",
+         "chromEnd_flanked",
+         "flank5",
+         "flank3",
+     ]
+     results_init = pd.DataFrame(columns=results_cols)
+
+     # Make a list of the types of runs we need: a flanked run, plus an
+     # unflanked control run if requested
+     queries = [{"flank5": flank5, "flank3": flank3}]
+     if flank5 == 0 and flank3 == 0:
+         queries[0]["type"] = "control"
+     else:
+         queries[0]["type"] = "flank"
+         if control_run:
+             queries.append({"type": "control", "flank5": 0, "flank3": 0})
+
+     # For each chromosome, download the encRegTfbsClustered track, extract the
+     # features, and fetch the sequences
+     for chrom in chroms:
+         # Write the header row for this chromosome's output file
+         out_file = f"{out_dir}/encRegTfbsClustered_{genome}_{chrom}.csv"
+         results_init.to_csv(out_file, index=False)
+         logging.info(f"Fetching {chrom}...")
+         # Fetch the data json (has start and end positions in the chrom, but not the sequence)
+         try:
+             data = fetch_tfbs_track(chrom, genome=genome)
+             logging.info(f"  → Fetched {chrom} successfully")
+             features = data.get("encRegTfbsClustered", [])
+             logging.info(f"  → Found {len(features)} features")
+         except Exception as e:
+             logging.info(f"  Failed to fetch {chrom}: {e}")
+             continue
+
+         # Get the sequences of the DNA binding sites
+         for feature_no, feature in enumerate(features):
+             # Initialize new results row
+             new_row = {}
+
+             # Skip TFs outside the allowed list, if one was given
+             tf_name = feature.get("name", "UnknownTF")
+             if allowed_tfs and tf_name not in allowed_tfs:
+                 logging.warning(f"TF name {tf_name} not in allowed_tfs. Skipping.")
+                 continue
+             # Make sure the chromosomes match and we have the right sequence!
+             assert (
+                 feature["chrom"] == chrom
+             ), f"Chromosome mismatch: {feature['chrom']} != {chrom}"
+
+             # Add all the cols already in the json
+             for c in results_cols:
+                 if c in feature:
+                     new_row[c] = feature[c]
+
+             ### Extract sequence
+             start = feature["chromStart"]
+             end = feature["chromEnd"]
+
+             for query in queries:
+                 try:
+                     results_dict = get_sequence(
+                         chrom,
+                         start,
+                         end,
+                         flank5=query["flank5"],
+                         flank3=query["flank3"],
+                         genome=genome,
+                     )
+                     logging.info(
+                         f"  Success on feat. {feature_no} {chrom}:{start}-{end}, type {query['type']}"
+                     )
+                     # Add the returned info
+                     if query["type"] == "control":
+                         new_row["seq"] = results_dict["seq"]
+                     else:
+                         new_row["seq_flanked"] = results_dict["seq"]
+                         new_row["chromStart_flanked"] = results_dict["chromStart"]
+                         new_row["chromEnd_flanked"] = results_dict["chromEnd"]
+                         new_row["flank5"] = flank5
+                         new_row["flank3"] = flank3
+                     count += 1
+                 except Exception as e:
+                     logging.info(
+                         f"  Skipped feat. {feature_no} {chrom}:{start}-{end} due to error: {e}"
+                     )
+                     continue
+
+                 sleep(0.05)  # Stay within UCSC's 20 req/sec rate limit
+
+             # Fill out any blank columns
+             for c in results_cols:
+                 if c not in new_row:
+                     new_row[c] = None
+
+             new_row_df = pd.DataFrame([new_row], columns=results_cols)
+             if new_row["seq"] is not None or new_row["seq_flanked"] is not None:
+                 new_row_df.to_csv(
+                     out_file,
+                     mode="a",
+                     index=False,
+                     header=False,
+                 )
+                 logging.info(f"Wrote new row to {out_file}")
+
+     logging.info(f"Done. Wrote {count} sequences to {out_dir}")
+
+
+ # Thread function for one chromosome
+ def process_chrom(
+     chrom: str = "chr1",
+     genome: str = "hg38",
+     flank5: int = 500,
+     flank3: int = 500,
+     control_run: bool = True,
+     out_dir: str = "../data_files/processed/tfclust",
+     allowed_tfs: list = None,
+ ):
+     """
+     Called within the parallel method to start a thread.
+     """
+     logging.info(f"Starting thread for {chrom}")
+     try:
+         extract_tfbs_with_context(
+             genome=genome,
+             flank5=flank5,
+             flank3=flank3,
+             control_run=control_run,
+             out_dir=out_dir,
+             allowed_tfs=allowed_tfs,
+             chroms=[chrom],  # important: wrap in list
+         )
+         logging.info(f"Finished {chrom}")
+     except Exception as e:
+         logging.error(f"Error processing {chrom}: {e}")
+
+
+ def parallel_extract_tfbs_with_context(
+     genome: str = "hg38",
+     flank5: int = 500,
+     flank3: int = 500,
+     control_run: bool = True,
+     out_dir: str = "../data_files/processed/tfclust",
+     allowed_tfs: list = None,
+     chroms: list = None,
+     max_cpu_frac: float = None,
+ ):
+     """
+     Call extract_tfbs_with_context() using multithreading, one thread per chromosome.
+     """
+     # Get all chromosomes if not supplied
+     if chroms is None:
+         chroms = get_all_chroms(genome=genome)
+
+     # Determine max workers
+     max_workers = len(chroms)
+     max_available = int(multiprocessing.cpu_count())
+     if max_cpu_frac is not None:
+         max_available = int(multiprocessing.cpu_count() * max_cpu_frac)
+     max_workers = min(max_workers, max_available)
+     logging.info(
+         f"{max_available} CPU cores available. Using {max_workers} threads for genome {genome}..."
+     )
+
+     # Launch threads, forwarding this call's settings to each worker
+     with ThreadPoolExecutor(max_workers=max_workers) as executor:
+         futures = {
+             executor.submit(
+                 process_chrom,
+                 chrom,
+                 genome=genome,
+                 flank5=flank5,
+                 flank3=flank3,
+                 control_run=control_run,
+                 out_dir=out_dir,
+                 allowed_tfs=allowed_tfs,
+             ): chrom
+             for chrom in chroms
+         }
+         for future in as_completed(futures):
+             chrom = futures[future]
+             try:
+                 future.result()
+             except Exception as e:
+                 logging.error(f"Chromosome {chrom} raised an exception: {e}")
+
+
+ def main():
+     genomes = ["hg38", "hg19"]
+     frac_per_genome = round(1 / len(genomes), 1)
+     for genome in genomes:
+         parallel_extract_tfbs_with_context(
+             genome=genome,
+             flank5=500,
+             flank3=500,
+             control_run=True,  # if there's a flank, whether to also run without flank
+             out_dir=f"../data_files/processed/tfclust/{genome}",
+             allowed_tfs=None,  # e.g., ['CTCF', 'MAX']
+             chroms=None,
+             max_cpu_frac=frac_per_genome,
+         )
+
+
+ if __name__ == "__main__":
+     logging.basicConfig(
+         filename="download.log",
+         encoding="utf-8",
+         level=logging.DEBUG,
+         filemode="w",
+     )
+     main()