ChatterjeeLab
/

DPACMAN

Model card Files Files and versions

xet

Community

svincoff committed on Jun 17, 2025

Commit

dca7b00

1 Parent(s): fc82077

download working

Browse files

Files changed (3) hide show

.gitignore +2 -1
README.md +48 -18
dpacman/data/tfclust/download.py +316 -214

.gitignore CHANGED Viewed

@@ -1,2 +1,3 @@
 dpacman/data_files
-dpacman/data/tfclust/*.log

 dpacman/data_files
+dpacman/data/tfclust/*.log
+dpacman/data/tfclust/temp.py

README.md CHANGED Viewed

@@ -8,22 +8,52 @@ license: cc-by-nc-nd-4.0
 .
 ├── README.md
 ├── dpacman
-│   ├── data
-│   │   ├── README.md
-│   │   ├── chip_atlas
-│   │   │   ├── full_data_loading.py
-│   │   │   └── smaller_data_loading.py
-│   │   └── tfclust
-│   │       └── download.py
-│   └── data_files
-│       ├── processed
-│       │   └── tfclust
-│       └── raw
-│           ├── chip_atlas
-│           │   └── experimentList.tab
-│           └── tfclust
-│               ├── encRegTfbsClusteredWithCells.hg19.bed
-│               └── encRegTfbsClusteredWithCells.hg38.bed
 ├── environment.yaml
-└── setup.py
-```

 .
 ├── README.md
 ├── dpacman
+│   ├── data
+│   │   ├── README.md
+│   │   ├── chip_atlas
+│   │   │   ├── full_data_loading.py
+│   │   │   └── smaller_data_loading.py
+│   │   └── tfclust
+│   │       ├── api_download.py
+│   │       ├── download.log
+│   │       ├── download.py
+│   │       ├── hg38_success_download.log
+│   │       └── temp.py
+│   └── data_files
+│       ├── processed
+│       │   └── tfclust
+│       │       ├── hg19
+│       │       │   ├── encRegTfbsClustered_hg19_chr1.csv
+│       │       │   └── logs
+│       │       │       ├── completed.txt
+│       │       │       ├── completed_worker_0.txt
+│       │       │       ├── worker_0.log
+│       │       └── hg38
+│       │           ├── encRegTfbsClustered_hg38_chr1.csv
+│       │           └── logs
+│       │               ├── completed.txt
+│       │               ├── completed_worker_0.txt
+│       │               ├── worker_0.log
+│       └── raw
+│           ├── chip_atlas
+│           │   └── experimentList.tab
+│           ├── genomes
+│           │   ├── hg19
+│           │   │   ├── hg19_chr1.json
+│           │   └── hg38
+│           │       ├── hg38_chr1.json
+│           └── tfclust
+│               ├── encRegTfbsClusteredWithCells.hg19.bed
+│               ├── encRegTfbsClusteredWithCells.hg38.bed
+│               └── encRegTfbsClustered_data
+│                   ├── hg19
+│                   │   ├── hg19_encRegTfbsClustered_chr1.json
+│                   └── hg38
+│                       ├── hg38_encRegTfbsClustered_chr1.json
 ├── environment.yaml
+├── setup.py
+└── tree_output.txt
+```
+20 directories, 3089 files
+In the `data_files` subfolders, only one representative file is shown per file type (the one for chr1). In reality, every file whose name contains the substring "_chr" exists once for each chromosome of that genome. Genome hg38 has 711 chromosomes and genome hg19 has 298 chromosomes.

dpacman/data/tfclust/download.py CHANGED Viewed

@@ -1,50 +1,90 @@
 import requests
-from time import sleep
 import json
-import logging
 import multiprocessing
-from concurrent.futures import ThreadPoolExecutor, as_completed
-import os
-import pandas as pd
 def get_all_tfs(genome: str = "hg38"):
-    """
-    Get all the transcription factors from the appropriate encRegTfbsClusteredWithCells.genome.bed file.
-    Available in data_files/raw/tfclust for genomes hg38 and hg19
-    """
-    # Read raw file
     raw_data = pd.read_csv(
-        "../data_files/encode3TfbsClusteredWithCells.bed", sep="\t", header=None
     )
     raw_data.columns = ["chrom", "start", "end", "tf_name", "score", "cell_line"]
-    # Extract all unique TF names
-    all_tfs = encode_raw["tf_name"].unique().tolist()
     logging.info(f"Found {len(all_tfs)} transcription factors in genome {genome}.")
     return all_tfs
-def get_all_chroms(genome: str = "hg38"):
     """
     Fetch all chromosome names for a genome.
     Note: some chromosomes are in unexpected formats (e.g. there is 'chr15', but also 'chr15_ML143371v1_fix')
     """
     url = f"https://api.genome.ucsc.edu/list/chromosomes?genome={genome}"
     try:
         r = requests.get(url)
         r.raise_for_status()
     except:
-        raise ValueError(f"Failed to fetch all chromosomes for genome {genome}")
-    all_chroms = [chrom for chrom in r.json()["chromosomes"]]
-    logging.info(f"Found {len(all_chroms)} chromosomes in genome {genome}.")
     return all_chroms
-def fetch_tfbs_track(chrom: str, genome: str = "hg38"):
     """
     Fetch raw data from the track encRegTfbsClustered.
     Returns json data for the specified chromosome, where key information appears as follows:
@@ -63,87 +103,106 @@ def fetch_tfbs_track(chrom: str, genome: str = "hg38"):
         ]
     """
     params = {"genome": genome, "track": "encRegTfbsClustered", "chrom": chrom}
-    url = f"https://api.genome.ucsc.edu/getData/track?genome={params['genome']};track={params['track']};chrom={params['chrom']}"
-    try:
-        r = requests.get(url)
-        r.raise_for_status()
-    except:
-        raise ValueError(
-            f"Failed to fetch encRegTfbsClustered for {chrom} in genome {genome}"
-        )
-    # Extract the output and save it
-    json_out_dir = f"../data_files/raw/tfclust/encRegTfbsClustered_data/{genome}"
-    os.makedirs(json_out_dir, exist_ok=True)
-    # Save it
-    json_output = r.json()
-    with open(
-        f"{json_out_dir}/{params['genome']}_{params['track']}_{params['chrom']}.json",
-        "w",
-    ) as f:
-        json.dump(json_output, f, indent=4)
-    logging.info(
-        f"Saved to {json_out_dir}/{params['genome']}_{params['track']}_{params['chrom']}.json"
-    )
     return json_output
 def get_sequence(
-    chrom: str,
     start: int,
     end: int,
     flank5: int = 0,
     flank3: int = 0,
     genome: str = "hg38",
 ):
     """
-    Given genome, start position, end position, chromosome, and desired flank size, extract the raw DNA sequence
-    """
-    new_start = max(0, start - flank)
-    new_end = end + flank
-    region = f"{chrom}:{new_start}-{new_end}"
-    url = f"https://api.genome.ucsc.edu/getData/sequence?genome={genome};chrom={chrom};start={new_start};end={new_end}"
-    try:
-        r = requests.get(url)
-        r.raise_for_status()
-    except:
-        raise ValueError(f"Failed to fetch sequence for {region} in genome {genome}")
     results_dict = {
         "chromStart": new_start,
         "chromEnd": new_end,
-        "seq": r.json()["dna"],
     }
     return results_dict
 def extract_tfbs_with_context(
-    genome: str = "hg38",
-    flank5: int = 500,
-    flank3: int = 500,
     control_run: bool = True,  # if there's a flank, whether to also run without flank
-    out_dir: str = "../data_files/processed/tfclust",
-    allowed_tfs: list = None,  # e.g., ['CTCF', 'MAX']
     chroms: list = None,
 ):
     """
-    Loop through raw downloads and extract TF binding sites (bs) with flanks
-    Builds a DataFrame with all the available data for each TF. Columns = ["bin", "chrom", "chromStart", "chromEnd", "name", "score", "scoreCount", "sourceIds", "sourceScores", "seq", "seq_flanked", "chromStart_flanked", "chromEnd_flanked"]
     """
-    # Prepare to save output
-    os.makedirs(out_dir, exist_ok=True)
-    # Get chromosomes
     if chroms is None:
-        logging.info(
-            "No chromosomes provided, fetching all chromosomes for the given genome..."
         )
-        chroms = get_all_chroms(genome)
-    count = 0
     # Initialize the final DF
     results_cols = [
         "bin",
@@ -163,7 +222,7 @@ def extract_tfbs_with_context(
         "flank3",
     ]
     results_init = pd.DataFrame(columns=results_cols)
     # Make a list of the types of runs we need
     queries = [{"flank5": flank5, "flank3": flank3}]
     if not ((flank5 == 0) and (flank3 == 0) and control_run):
@@ -171,22 +230,51 @@ def extract_tfbs_with_context(
         queries[0]["type"] = "flank"
     elif (flank5 == 0) and (flank3 == 0):
         queries[0]["type"] = "control"
-    # For each chromosome, download the encRegTfbsClustered track, extract the features, and fetch the sequences
-    # Loop through chroms
-    for chrom in chroms:
         results_init.to_csv(
-            f"{out_dir}/encRegTfbsClustered_{genome}_{chrom}.csv", index=False
         )
-        logging.info(f"Fetching {chrom}...")
         # Fetch the data json (has start and end positions in the chrom, but not the sequence)
         try:
-            data = fetch_tfbs_track(chrom, genome=genome)
-            logging.info(f"  → Fetched {chrom} successfully")
             features = data.get("encRegTfbsClustered", {})
-            logging.info(f"  → Found {len(features)} features")
         except Exception as e:
-            logging.info(f"  Failed to fetch {chrom}: {e}")
             continue
         # Get the sequences of the DNA binding sites
@@ -197,9 +285,8 @@ def extract_tfbs_with_context(
             # Check if tf is valid
             tf_name = feature.get("name", "UnknownTF")
             if allowed_tfs and tf_name not in allowed_tfs:
                 continue
-            else:
-                logging.warning(f"TF name {tf_name} not in allowed_tfs. Skipping.")
             # Make sure the chromosomes match and we have the right sequence!
             assert (
                 feature["chrom"] == chrom
@@ -215,146 +302,161 @@ def extract_tfbs_with_context(
             end = feature["chromEnd"]
             for query in queries:
-                try:
-                    results_dict = get_sequence(
-                        chrom,
-                        start,
-                        end,
-                        flank5=query["flank5"],
-                        flank3=query["flank3"],
-                        genome=genome,
-                    )
-                    logging.info(
-                        f" Success on feat. {feature_no} {chrom}:{start}-{end}, type {query['type']}"
-                    )
-                    # Add the returned info
-                    if type == "control":
-                        new_row["seq"] = results_dict["seq"]
-                    else:
-                        new_row["seq_flanked"] = results_dict["seq"]
-                        new_row["chromStart_flanked"] = results_dict["chromStart"]
-                        new_row["chromEnd_flanked"] = results_dict["chromEnd"]
-                        new_row["flank5"] = flank5
-                        new_row["flank3"] = flank3
-                    count += 1
-                except Exception as e:
-                    logging.info(
-                        f"  Skipped feat. {feature_no} {chrom}:{start}-{end} due to error: {e}"
-                    )
-                    continue
-                sleep(0.05)  # Stay within UCSC's 20 req/sec rate limit
-            # Fill out any blank columns
-            for c in results_cols:
-                if c not in new_row:
-                    new_row[c] = None
-            new_row_df = pd.DataFrame(data=new_row, columns=results_cols)
-            if new_row_df["seq"] is not None:
-                new_row_df.to_csv(
-                    f"{out_dir}/encRegTfbsClustered_{chrom}.csv",
-                    mode="a",
-                    index=False,
-                    header=False,
-                )
-                logging.info(
-                    f"Wrote new row to {out_dir}/encRegTfbsClustered_{chrom}.csv"
                 )
-    logging.info(f"Done. Wrote {count} sequences to {output}")
-# Thread function for one chromosome
-def process_chrom(
-    chrom: str = "chr1",
-    genome: str = "hg38",
-    flank5: int = 500,
-    flank3: int = 500,
-    control_run: bool = True,
-    out_dir: str = "../data_files/processed/tfclust",
-    allowed_tfs: list = None,
-    max_cpu_frac: float = None,
-):
     """
-    Called within parallel method to strat a thread
     """
-    logging.info(f"Starting thread for {chrom}")
-    try:
-        extract_tfbs_with_context(
-            genome=genome,
-            flank5=flank5,
-            flank3=flank3,
-            control_run=control_run,
-            out_dir=out_dir,
-            allowed_tfs=allowed_tfs,
-            chroms=[chrom],  # important: wrap in list
-        )
-        logging.info(f"Finished {chrom}")
-    except Exception as e:
-        logging.error(f"Error processing {chrom}: {e}")
-def parallel_extract_tfbs_with_context(
-    genome: str = "hg38",
-    flank5: int = 500,
-    flank3: int = 500,
-    control_run: bool = True,
-    out_dir: str = "../data_files/processed/tfclust",
-    allowed_tfs: list = None,
-    chroms: list = None,
-    max_cpu_frac: float = None,
-):
     """
-    Call extract_tfbs_with_context() using multithreading, one thread per chromosome.
     """
-    # Get all chromosomes if not supplied
-    if chroms is None:
-        chroms = get_all_chroms(genome=genome)
-    # Determine max workers
-    max_workers = len(chroms)
-    max_available = int(multiprocessing.cpu_count())
-    if max_cpu_frac is not None:
-        max_available = int(multiprocessing.cpu_count() * max_cpu_frac)
-    max_workers = min(max_workers, max_available)
-    logging.info(
-        f"{max_available} CPU cores available. Using {max_workers} threads for genome {genome}..."
     )
-    # Launch threads
-    with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        futures = {executor.submit(process_chrom, chrom): chrom for chrom in chroms}
-        for future in as_completed(futures):
-            chrom = futures[future]
-            try:
-                future.result()
-            except Exception as e:
-                logging.error(f"Chromosome {chrom} raised an exception: {e}")
 def main():
     genomes = ["hg38", "hg19"]
-    frac_per_genome = round(1 / len(genomes), 1)
     for genome in genomes:
-        all_chroms = get_all_chroms(genome=genome)
-        parallel_extract_tfbs_with_context(
-            genome=genome,
-            flank5=500,
-            flank3=500,
-            control_run=True,  # if there's a flank, whether to also run without flank
-            out_dir=f"../data_files/processed/tfclust/{genome}",
-            allowed_tfs=None,  # e.g., ['CTCF', 'MAX']
-            chroms=None,
-            max_cpu_frac=frac_per_genome,
-        )
 if __name__ == "__main__":
     logger = logging.getLogger(__name__)
-    logging.basicConfig(
-        filename="download.log",
-        encoding="utf-8",
-        level=logging.DEBUG,
-        filemode="w",
-    )

+import os
+import logging
 import requests
+import pandas as pd
 import json
 import multiprocessing
+from math import ceil
+from datetime import datetime
 def get_all_tfs(genome: str = "hg38"):
     raw_data = pd.read_csv(
+        f"../../data_files/encode3TfbsClusteredWithCells.bed", sep="\t", header=None
     )
     raw_data.columns = ["chrom", "start", "end", "tf_name", "score", "cell_line"]
+    all_tfs = raw_data["tf_name"].unique().tolist()
     logging.info(f"Found {len(all_tfs)} transcription factors in genome {genome}.")
     return all_tfs
def get_all_chroms(genome: str = "hg38", exclude: list=None, include: list=None, logger: logging.Logger=None):
    """
    Fetch all chromosome names for a genome.
    Note: some chromosomes are in unexpected formats (e.g. there is 'chr15', but also 'chr15_ML143371v1_fix')

    Args:
        genome: genome assembly name passed to the UCSC ``list/chromosomes`` endpoint.
        exclude: chromosome names to drop from the result. Mutually exclusive with ``include``.
        include: chromosome names to keep; everything else is dropped. Mutually exclusive with ``exclude``.
        logger: logger to report to. Defaults to this module's logger.

    Returns:
        List of chromosome names, in the order the API returned them, after filtering.

    Raises:
        ValueError: if both ``include`` and ``exclude`` are given, or if the chromosome
            list cannot be fetched or parsed.
    """
    if logger is None:
        logger = logging.getLogger(__name__)
    # Validate arguments before touching the network so bad calls fail fast.
    if include is not None and exclude is not None:
        raise ValueError("Must pass EITHER exclude or include. Cannot pass both.")

    url = f"https://api.genome.ucsc.edu/list/chromosomes?genome={genome}"
    try:
        r = requests.get(url)
        r.raise_for_status()
        all_chroms = list(r.json()["chromosomes"].keys())
    except (requests.RequestException, ValueError, KeyError, AttributeError) as e:
        # Without a usable response there is nothing to return, so surface the failure
        # instead of continuing with an unbound or invalid response object.
        logger.error(f"Failed to fetch all chromosomes for genome {genome}")
        raise ValueError(f"Failed to fetch all chromosomes for genome {genome}") from e

    if include is not None:
        logger.info(f"Including only the following chromosomes: {include}")
        wanted = set(include)
        all_chroms = [chrom for chrom in all_chroms if chrom in wanted]
    if exclude is not None:
        logger.info(f"Excluding the following chromosomes: {exclude}")
        unwanted = set(exclude)
        all_chroms = [chrom for chrom in all_chroms if chrom not in unwanted]
    logger.info(f"Found {len(all_chroms)} chromosomes in genome {genome}.")
    return all_chroms
def get_all_chrom_fasta_files(genome: str = "hg38", exclude: list=None, include: list=None, logger: logging.Logger=None, out_dir="../../data_files/raw/genomes"):
    """
    Download the full sequence of each chromosome of a genome as JSON.

    Each chromosome is fetched from the UCSC ``getData/sequence`` endpoint and saved to
    ``{out_dir}/{genome}/{genome}_{chrom}.json``. Chromosomes whose file already exists
    are skipped. A failed download is logged and does not stop the loop.

    Args:
        genome: genome assembly name.
        exclude: chromosome names to skip. Mutually exclusive with ``include``.
        include: chromosome names to restrict to. Mutually exclusive with ``exclude``.
        logger: logger to report to. Defaults to this module's logger.
        out_dir: directory under which the per-genome folder is created.

    Returns:
        The list of chromosome names that were considered (after include/exclude
        filtering), whether or not each download succeeded.

    Raises:
        ValueError: if both ``include`` and ``exclude`` are given.
    """
    if logger is None:
        logger = logging.getLogger(__name__)
    if include is not None and exclude is not None:
        raise ValueError("Must pass EITHER exclude or include. Cannot pass both.")
    chroms = get_all_chroms(genome=genome, exclude=exclude, include=include, logger=logger)
    genome_out_dir = os.path.join(out_dir, genome)
    os.makedirs(genome_out_dir, exist_ok=True)

    downloaded = 0
    cached = 0
    failed = []
    for chrom in chroms:
        chrom_save_path = os.path.join(genome_out_dir, f"{genome}_{chrom}.json")
        if os.path.exists(chrom_save_path):
            logger.info(f"Already downloaded {chrom} in genome {genome}. Skipping.")
            cached += 1
            continue

        url = f"https://api.genome.ucsc.edu/getData/sequence?genome={genome};chrom={chrom}"
        # Write to a temporary file first and rename on success: an interrupted write must
        # not leave a truncated JSON that a later run would treat as "already downloaded".
        tmp_path = f"{chrom_save_path}.{os.getpid()}.part"
        try:
            r = requests.get(url)
            r.raise_for_status()
            json_output = r.json()
            with open(tmp_path, "w") as f:
                json.dump(json_output, f, indent=4)
            os.replace(tmp_path, chrom_save_path)
        except (requests.RequestException, ValueError, OSError) as e:
            logger.error(f"Failed to fetch all {chrom} for genome {genome}: {e}")
            failed.append(chrom)
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        else:
            logger.info(f"Downloaded {chrom} in genome {genome}.")
            downloaded += 1

    logger.info(
        f"Downloaded {downloaded} chromosomes in genome {genome} "
        f"({cached} already present, {len(failed)} failed)."
    )
    if failed:
        logger.warning(f"Chromosomes that failed to download for genome {genome}: {failed}")
    return chroms
+def fetch_tfbs_track(chrom: str, genome: str = "hg38", logger:logging.Logger=None):
     """
     Fetch raw data from the track encRegTfbsClustered.
     Returns json data for the specified chromosome, where key information appears as follows:
         ]
     """
+    if logger is None:
+        logger = logging.getLogger(__name__)
     params = {"genome": genome, "track": "encRegTfbsClustered", "chrom": chrom}
+    json_out_dir = os.path.join("../../data_files/raw/tfclust/encRegTfbsClustered_data", genome)
+    json_out_path = os.path.join(json_out_dir, f"{params['genome']}_{params['track']}_{params['chrom']}.json")
+    if not(os.path.exists(json_out_path)):
+        url = f"https://api.genome.ucsc.edu/getData/track?genome={params['genome']};track={params['track']};chrom={params['chrom']}"
+        try:
+            r = requests.get(url)
+            r.raise_for_status()
+            # Extract the output and save it
+            os.makedirs(json_out_dir, exist_ok=True)
+            # Save it
+            json_output = r.json()
+            with open(json_out_path, "w") as f:
+                json.dump(json_output, f, indent=4)
+            logger.info(
+                f"Saved to {json_out_path}"
+            )
+        except:
+            logger.error(
+                f"Failed to fetch encRegTfbsClustered for {chrom} in genome {genome}"
+            )
+    else:
+        logging.info(f"Already downloaded encRegTfbsClustered for {chrom} in {genome}. Skipping download.")
+        with open(json_out_path, "r") as f:
+            json_output = json.load(f)
     return json_output
 def get_sequence(
+    chrom_json: dict,
     start: int,
     end: int,
     flank5: int = 0,
     flank3: int = 0,
     genome: str = "hg38",
+    logger: logging.Logger=None
 ):
     """
+    Given genome, start position, end position, chromosome json, and desired flank size, extract the raw DNA sequence
+    chrom_json has keys: "downloadTime", "downloadTimeStamp","genome", "chrom", "start", "end", "dna"
+    """
+    if logger is None:
+        logger = logging.getLogger(__name__)
+    chrom_seq = chrom_json["dna"]
+    chrom = chrom_json["chrom"]
+    if chrom_json["start"] != 0:
+        logger.warning(f"Start position of chromosome is not 0. Start position: {chrom_json['start']}")
+    # Calculate new start and end indices
+    new_start = max(0, start - flank5)
+    new_end = end + flank3
+    if new_end > chrom_json["end"]:
+        logger.warning(f"Attempting to query {chrom} from {new_start} to {new_end}, but last index is {chrom_json['end']}. Manually setting last index to {chrom_json['end']}")
+        new_end = chrom_json['end']
     results_dict = {
         "chromStart": new_start,
         "chromEnd": new_end,
+        "seq": chrom_seq[new_start:new_end+1]
     }
     return results_dict
 def extract_tfbs_with_context(
+    genome: str,
+    flank5: int=500,
+    flank3: int=500,
+    allowed_tfs: list=None,
+    out_dir: str="../../data_files/processed/tfclust",
     control_run: bool = True,  # if there's a flank, whether to also run without flank
     chroms: list = None,
+    logger: logging.Logger=None,
+    redo: bool  = False, # whether to redo even if we've already processed this
+    idx: int=0  # index of worker
 ):
     """
+    Main method for a genome. By calling helpers, gets all chromosomes and their sequences, gets encRegTfbsClustered, and queries the feature indices in encRegTfbsClustered against chrom seqs for binding site sequences.
     """
+    if logger is None:
+        logger = logging.getLogger(__name__)
+    # Get all chromosomes for the current genome, including downloading their sequences
     if chroms is None:
+        all_chroms = get_all_chrom_fasta_files(genome=genome, logger=logger)
+    else:
+        all_chroms = get_all_chrom_fasta_files(
+            genome=genome,
+            exclude=[c for c in get_all_chroms(genome) if c not in chroms],
+            logger=logger
         )
+    # For each chrom, (1) download full fasta sequence, (2) download encRegTfbsClustered, (3) query features from [2] through [1]
     # Initialize the final DF
     results_cols = [
         "bin",
         "flank3",
     ]
     results_init = pd.DataFrame(columns=results_cols)
     # Make a list of the types of runs we need
     queries = [{"flank5": flank5, "flank3": flank3}]
     if not ((flank5 == 0) and (flank3 == 0) and control_run):
         queries[0]["type"] = "flank"
     elif (flank5 == 0) and (flank3 == 0):
         queries[0]["type"] = "control"
+    merged_done_txt_path = os.path.join("../../data_files/processed/tfclust", genome, "logs", f"completed.txt")
+    done_txt_path = os.path.join("../../data_files/processed/tfclust", genome, "logs", f"completed_worker_{idx}.txt")
+    if os.path.exists(merged_done_txt_path):
+        completed_chroms = pd.read_csv(merged_done_txt_path, sep="\t")
+        completed_chroms = list(completed_chroms["chrom"])
+    else:
+        completed_chroms = []
+    with open(done_txt_path, "w") as f:
+        f.write("chrom\trow_count\n")
+    logger.info(f"{len(completed_chroms)} already complete: {','.join(completed_chroms)}")
+    count = 0
+    # Iterate through chromosomes (1) download encRegTfbsClustered, (2) query each feature in the chrom sequence
+    for chrom in all_chroms:
+        chrom_write_count = 0
+        chrom_output_fname = os.path.join("../../data_files/processed/tfclust", genome, f"encRegTfbsClustered_{genome}_{chrom}.csv")
+        # If we've already done it, no need
+        if chrom in completed_chroms and not(redo):
+            chrom_write_count = len(pd.read_csv(chrom_output_fname))
+            with open(done_txt_path, "a") as f:
+                f.write(f"{chrom}\t{chrom_write_count}\n")
+            continue
+        #### If we ARE processing this, process it!
+        # Load chromosome sequence
+        with open(os.path.join("../../data_files/raw/genomes",genome,f"{genome}_{chrom}.json"), "r") as f:
+            chrom_json = json.load(f)
         results_init.to_csv(
+            chrom_output_fname, index=False
         )
+        logger.info(f"Fetching {chrom}...")
         # Fetch the data json (has start and end positions in the chrom, but not the sequence)
         try:
+            data = fetch_tfbs_track(chrom, genome=genome, logger=logger)
+            logger.info(f"  → Fetched {chrom} successfully")
             features = data.get("encRegTfbsClustered", {})
+            logger.info(f"  → Found {len(features)} features")
         except Exception as e:
+            logger.info(f"  Failed to fetch {chrom}: {e}")
             continue
         # Get the sequences of the DNA binding sites
             # Check if tf is valid
             tf_name = feature.get("name", "UnknownTF")
             if allowed_tfs and tf_name not in allowed_tfs:
+                logger.warning(f"TF name {tf_name} not in allowed_tfs. Skipping.")
                 continue
             # Make sure the chromosomes match and we have the right sequence!
             assert (
                 feature["chrom"] == chrom
             end = feature["chromEnd"]
             for query in queries:
+                results_dict = get_sequence(
+                    chrom_json,
+                    start,
+                    end,
+                    flank5=query["flank5"],
+                    flank3=query["flank3"],
+                    genome=genome,
+                    logger = logger
                 )
+                # Add the returned info
+                if query["type"] == "control":
+                    new_row["seq"] = results_dict["seq"]    # note: these sequences will have soft-masked repeats!
+                elif query["type"] == "flank":
+                    new_row["seq_flanked"] = results_dict["seq"]
+                    new_row["chromStart_flanked"] = results_dict["chromStart"]
+                    new_row["chromEnd_flanked"] = results_dict["chromEnd"]
+                    new_row["flank5"] = flank5
+                    new_row["flank3"] = flank3
+            # Fill out any blank columns
+            try:
+                for c in results_cols:
+                    if c not in new_row:
+                        new_row[c] = None
+                new_row_df = pd.DataFrame(data=new_row, index=[0])
+                new_row_df = new_row_df[results_cols]   # assert the right column order
+                if new_row_df["seq"] is not None:
+                    new_row_df.to_csv(
+                        chrom_output_fname,
+                        mode="a",
+                        index=False,
+                        header=False,
+                    )
+                    logger.info(
+                        f"Wrote new row to {out_dir}/encRegTfbsClustered_{chrom}.csv"
+                    )
+                    chrom_write_count += 1
+                else:
+                    logger.info(f"Did not write new row. {new_row}")
+            except Exception as e:
+                logger.error(F"Failed to write new row to {out_dir}/encRegTfbsClustered_{chrom}.csv: error {e}")
+        logger.info(f"Done. Wrote {chrom_write_count} sequences to {out_dir}/encRegTfbsClustered_{chrom}.csv")
+        with open(done_txt_path, "a") as f:
+            f.write(f"{chrom}\t{chrom_write_count}\n")
+        count += chrom_write_count
+    logger.info(f"Done with all chroms. Wrote {count} sequences to {out_dir}.")
def merge_completed_files(genome: str):
    """
    Merge all completed_worker_*.txt files into a single completed.txt file

    The worker files are read from ``../../data_files/processed/tfclust/{genome}/logs``
    (relative to the current working directory) in sorted filename order, so the merged
    file is the same on every run. Header lines and blank lines are dropped, and every
    record is written on its own line even if a worker file lacks a trailing newline.

    Args:
        genome: genome assembly name; selects the logs directory to merge.

    Raises:
        FileNotFoundError: if the logs directory does not exist.
    """
    logs_dir = os.path.join("../../data_files/processed/tfclust", genome, "logs")
    merged_path = os.path.join(logs_dir, "completed.txt")
    header = "chrom\trow_count"
    # os.listdir order is arbitrary; sort so the merged output is deterministic.
    worker_files = sorted(
        fname
        for fname in os.listdir(logs_dir)
        if fname.startswith("completed_worker_") and fname.endswith(".txt")
    )
    with open(merged_path, "w") as outfile:
        outfile.write(header + "\n")
        for fname in worker_files:
            with open(os.path.join(logs_dir, fname), "r") as infile:
                for line in infile:
                    record = line.rstrip("\r\n")
                    # Skip only exact header lines (not any name that starts with "chrom")
                    # and blank lines.
                    if not record.strip() or record == header:
                        continue
                    outfile.write(record + "\n")
    print(f"Merged completed_worker_*.txt into {merged_path}")
def worker(args):
    """
    Worker function for parallel processing

    Sets up a dedicated log file ``worker_{idx}.log`` in ``logs_dir`` and runs
    extract_tfbs_with_context on this worker's group of chromosomes.

    Args:
        args: tuple ``(chrom_group, idx, genome, flank5, flank3, logs_dir)`` where
            ``chrom_group`` is the list of chromosome names for this worker, ``idx`` is
            the worker index, and ``logs_dir`` is the directory for the worker log.

    Raises:
        Exception: any error raised by extract_tfbs_with_context is logged to the
            worker log with its traceback and then re-raised.
    """
    # Extract args
    chrom_group, idx, genome, flank5, flank3, logs_dir = args
    os.makedirs(logs_dir, exist_ok=True)

    # Define logger: one file per worker, detached from the root logger
    logger = logging.getLogger(f"worker_{idx}")
    logger.setLevel(logging.DEBUG)
    logger.propagate = False
    log_file = os.path.join(logs_dir, f"worker_{idx}.log")
    fh = logging.FileHandler(log_file, mode="w", encoding="utf-8")
    fh.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    try:
        logger.info(f"Starting worker {idx} for chromosomes: {chrom_group}")
        extract_tfbs_with_context(
            genome=genome,
            flank5=flank5,
            flank3=flank3,
            allowed_tfs=None,
            out_dir="../../data_files/processed/tfclust",
            control_run=True,
            chroms=chrom_group,
            logger=logger,
            idx=idx
        )
        logger.info(f"Finished worker {idx}")
    except Exception:
        # Record the traceback in this worker's own log before the pool sees the error.
        logger.exception(f"Worker {idx} failed")
        raise
    finally:
        # Detach and close the handler so the file is flushed and a reused process
        # does not accumulate duplicate handlers on the same logger name.
        logger.removeHandler(fh)
        fh.close()
def parallel_extract(genome: str, flank5: int, flank3: int):
    """
    Run extract_tfbs_with_context in parallel for groups of chromosomes in the genome to speed up processing.

    Chromosomes whose name has no underscore ("primary") are dealt round-robin across one
    chunk per CPU core; the remaining ("accessory") chromosomes are then added one by one
    to whichever chunk currently has the fewest entries. Each non-empty chunk is handled
    by one worker process, and the per-worker completion files are merged at the end.

    Args:
        genome: genome assembly name.
        flank5: number of bases of 5' flank passed on to each worker.
        flank3: number of bases of 3' flank passed on to each worker.
    """
    chroms = get_all_chroms(genome)
    num_cores = multiprocessing.cpu_count()

    # Separate primary vs accessory chromosomes
    primary_chroms = [c for c in chroms if "_" not in c]
    accessory_chroms = [c for c in chroms if "_" in c]

    # Distribute primary chromosomes round-robin across workers
    chunks = [[] for _ in range(num_cores)]
    for i, chrom in enumerate(primary_chroms):
        chunks[i % num_cores].append(chrom)

    # Now add accessory chromosomes to the least-loaded chunk (by count)
    for chrom in accessory_chroms:
        min_idx = min(range(num_cores), key=lambda i: len(chunks[i]))
        chunks[min_idx].append(chrom)

    # With more cores than chromosomes some chunks stay empty; a worker for an empty chunk
    # would only repeat the API calls and do no work, so do not start one.
    chunks = [chunk for chunk in chunks if chunk]
    if not chunks:
        logging.warning(f"No chromosomes found for genome {genome}. Nothing to extract.")
        return

    # Log how we split it up - want to see which chromosomes are in which chunks.
    logging.info(f"{num_cores} CPU cores available. Primary chromosomes distributed round-robin.")
    for chunk_no, chunk in enumerate(chunks):
        logging.info(f"Chunk {chunk_no}. Chromosomes = {','.join(chunk)}")

    logs_dir = os.path.join("../../data_files/processed/tfclust", genome, "logs")
    os.makedirs(logs_dir, exist_ok=True)
    args_list = [(chunk, i, genome, flank5, flank3, logs_dir) for i, chunk in enumerate(chunks)]
    with multiprocessing.Pool(processes=len(chunks)) as pool:
        pool.map(worker, args_list)
    merge_completed_files(genome)
 def main():
     genomes = ["hg38", "hg19"]
+    flank5 = 1000
+    flank3 = 1000
+    # Iterate through genomes
     for genome in genomes:
+        # Extract TF binding sites from bed - 500 flank
+        parallel_extract(genome, flank5, flank3)
 if __name__ == "__main__":
+    logging.basicConfig(filename="download.log", encoding="utf-8", level=logging.DEBUG, filemode="w")
     logger = logging.getLogger(__name__)
+    main()