File size: 8,800 Bytes

a887ffc
 
 
 
 
29899b4
 
 
a887ffc
29899b4
a887ffc

import pandas as pd
from omegaconf import DictConfig
from pathlib import Path
import os

import rootutils
from dpacman.utils import pylogger

root = rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
logger = pylogger.RankedLogger(__name__, rank_zero_only=True)


def clean_nr(nr_raw_path: Path | str):
    """
    Clean the non-redundant peaks BED file.
    Delete duplicate rows, assign columns, only keep columns we need.
    """
    nr = pd.read_csv(nr_raw_path, sep="\t", header=None)
    nr.columns = [
        "chrom",
        "chromStart",
        "chromEnd",
        "biotypes",
        "score",
        "strand",
        "thickStart",
        "thickEnd",
        "itemRgb",
    ]

    # make sure we correctly interpret column "biotype" as having one transcription factor separated from all relevant biotypes by ONE colon
    biotype_colon_counts = (
        nr["biotypes"]
        .str.count(":")
        .value_counts()
        .reset_index()["biotypes"]
        .unique()
        .tolist()
    )
    assert biotype_colon_counts == [
        1
    ]  # confirm belief that : separates the name of a transcription factor from its biotype - just ONE biotype.

    # then split the column accordingly into tr (transcriptional regulator) and biotypes
    nr[["tr", "biotypes"]] = nr["biotypes"].str.split(":", expand=True)

    # group and concat the scores
    logger.info(
        f"Keeping only the following columns: chrom, chromStart, chromEnd, biotypes, tr, score."
    )
    nr = nr[["chrom", "chromStart", "chromEnd", "biotypes", "score", "tr"]]

    # drop duplicate rows - all fields
    logger.info(f"Size of database before dropping duplicate rows: {len(nr)}")
    nr = nr.drop_duplicates().reset_index(drop=True)
    logger.info(f"Size of database after dropping duplicate rows: {len(nr)}")

    # look for duplicate rows where it's clearly the same experiment but somehow different scores - chrom, chromStart, chromEnd, tr, biotypes
    experiment_dups = len(
        nr.loc[
            nr.duplicated(subset=["chrom", "chromStart", "chromEnd", "tr", "biotypes"])
        ]
    )
    logger.info(
        f"{experiment_dups} total rows with same chrom, chromStart, chromEnd, biotypes, tr but different score."
    )

    logger.info(
        f"Grouping by everything except score, comma-concatenating unique scores"
    )
    nr = (
        nr.groupby(["chrom", "chromStart", "chromEnd", "tr", "biotypes"])
        .agg({"score": lambda x: ",".join(map(str, sorted(set(x))))})
        .reset_index()
    )

    logger.info(f"Final database size: {len(nr)}")

    nr["chromLen"] = nr["chromEnd"] - nr["chromStart"]

    return nr


def clean_crm(crm_raw_path: Path | str):
    """
    Clean the CRM BED file.
    Delete duplicate rows, assign columns, only keep columns we need.
    """

    crm = pd.read_csv(crm_raw_path, sep="\t", header=None)
    crm.columns = [
        "chrom",
        "chromStart",
        "chromEnd",
        "tr",
        "score",
        "strand",
        "thickStart",
        "thickEnd",
        "reserved",
    ]

    # group and concat the scores
    logger.info(
        f"Keeping only the following columns: chrom, chromStart, chromEnd, tr, score."
    )
    crm = crm[["chrom", "chromStart", "chromEnd", "tr", "score"]]

    # drop duplicate rows - all fields
    logger.info(f"Size of database before dropping duplicate rows: {len(crm)}")
    crm = crm.drop_duplicates().reset_index(drop=True)
    logger.info(f"Size of database after dropping duplicate rows: {len(crm)}")

    # look for duplicate rows where it's clearly the same experiment but somehow different scores - chrom, chromStart, chromEnd, tr
    experiment_dups = len(
        crm.loc[crm.duplicated(subset=["chrom", "chromStart", "chromEnd", "tr"])]
    )
    logger.info(
        f"{experiment_dups} total rows with same chrom, chromStart, chromEnd, tr but different score."
    )

    logger.info(
        f"Grouping by everything except score, comma-concatenating unique scores"
    )
    crm = (
        crm.groupby(["chrom", "chromStart", "chromEnd", "tr"])
        .agg({"score": lambda x: ",".join(map(str, sorted(set(x))))})
        .reset_index()
    )

    logger.info(f"Final database size: {len(crm)}")

    crm["chromLen"] = crm["chromEnd"] - crm["chromStart"]

    return crm


def main(cfg: DictConfig):
    # Define the paths
    nr_raw_path = Path(root) / cfg.data_task.nr_raw_path
    nr_processed_dir = Path(root) / cfg.data_task.nr_processed_dir
    nr_processed_filename = cfg.data_task.nr_processed_filename
    nr_savepath = os.path.join(nr_processed_dir, nr_processed_filename)

    crm_raw_path = Path(root) / cfg.data_task.crm_raw_path
    crm_processed_dir = Path(root) / cfg.data_task.crm_processed_dir
    crm_processed_filename = cfg.data_task.crm_processed_filename
    crm_savepath = os.path.join(crm_processed_dir, crm_processed_filename)

    os.makedirs(nr_processed_dir, exist_ok=True)
    os.makedirs(crm_processed_dir, exist_ok=True)

    # Clean and save the non redundant peaks file
    if not (os.path.exists(nr_savepath)):
        nr_cleaned = clean_nr(nr_raw_path)
        nr_cleaned.to_csv(nr_savepath, sep="\t", index=False)
        logger.info(
            f"Saved cleaned non-redundant peaks (NR) database to: {nr_savepath}"
        )
    else:
        nr_cleaned = None
        logger.info(f"File already exists at {nr_savepath}. Skipping")

    # Clean and save the CRM file
    if not (os.path.exists(crm_savepath)):
        crm_cleaned = clean_crm(crm_raw_path)
        crm_cleaned.to_csv(crm_savepath, sep="\t", index=False)
        logger.info(
            f"Saved cleaned cis-regulatory modules (CRM) database to: {crm_savepath}"
        )
    else:
        crm_cleaned = None
        logger.info(f"File already exists at {crm_savepath}. Skipping")

    # Save example files
    if cfg.data_task.save_example_files:
        example_nr_dir = nr_processed_dir / "examples"
        os.makedirs(example_nr_dir, exist_ok=True)
        example_nr_savepath = os.path.join(
            example_nr_dir, "example500_" + nr_processed_filename
        )

        if not (os.path.exists(example_nr_savepath)):
            if nr_cleaned is None:
                nr_cleaned = pd.read_csv(nr_savepath, sep="\t")
            nr_cleaned.sample(n=500, random_state=42).reset_index(drop=True).to_csv(
                example_nr_savepath, sep="\t", index=False
            )
            logger.info(
                f"Saved example NR file with 500 rows to: {example_nr_savepath}"
            )
        else:
            logger.info(
                f"Example file already exists at {example_nr_savepath}. Skipping"
            )

        # CRM example
        example_crm_dir = crm_processed_dir / "examples"
        os.makedirs(example_crm_dir, exist_ok=True)
        example_crm_savepath = os.path.join(
            example_crm_dir, "example500_" + crm_processed_filename
        )
        if not (os.path.exists(example_crm_savepath)):
            if crm_cleaned is None:
                crm_cleaned = pd.read_csv(crm_savepath, sep="\t")
            crm_cleaned.sample(n=500, random_state=42).reset_index(drop=True).to_csv(
                example_crm_savepath, sep="\t", index=False
            )
            logger.info(
                f"Saved example CRM file with 500 rows to: {example_crm_savepath}"
            )
        else:
            logger.info(
                f"Example file already exists at {example_crm_savepath}. Skipping"
            )

        # CRM example for one transcription factor
        example_crm_tf_savepath = os.path.join(
            example_crm_dir, "example500_ERG_" + crm_processed_filename
        )
        if not (os.path.exists(example_crm_tf_savepath)):
            if crm_cleaned is None:
                crm_cleaned = pd.read_csv(crm_savepath, sep="\t")
            crm_example_tf_db = crm_cleaned.copy(deep=True)
            crm_example_tf_db["tr"] = crm_example_tf_db["tr"].apply(
                lambda x: x.split(",")
            )
            crm_example_tf_db = crm_example_tf_db.explode("tr").reset_index(drop=True)
            crm_example_tf_db = crm_example_tf_db.loc[crm_example_tf_db["tr"] == "ERG"]
            crm_example_tf_db = crm_example_tf_db.sample(
                n=min(500, len(crm_example_tf_db)), random_state=42
            ).reset_index(drop=True)
            crm_example_tf_db.to_csv(example_crm_tf_savepath, sep="\t", index=False)
            logger.info(
                f"Saved example CRM file for one TF with 500 rows to: {example_crm_tf_savepath}"
            )
        else:
            logger.info(
                f"Example file already exists at {example_crm_tf_savepath}. Skipping"
            )


if __name__ == "__main__":
    main()