| import pandas as pd |
| from omegaconf import DictConfig |
| from pathlib import Path |
| import os |
|
|
| import rootutils |
| from dpacman.utils import pylogger |
|
|
| root = rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True) |
| logger = pylogger.RankedLogger(__name__, rank_zero_only=True) |
|
|
|
|
def clean_nr(nr_raw_path: Path | str) -> pd.DataFrame:
    """Clean the non-redundant peaks (NR) BED file.

    Reads the raw 9-column, headerless, tab-separated BED file, validates
    and splits the ``biotypes`` field (``"<tr>:<biotype>"``), keeps only the
    needed columns, drops duplicate rows, and collapses rows that differ
    only in ``score`` by comma-concatenating their unique scores.

    Args:
        nr_raw_path: Path to the raw NR peaks BED file (TSV, no header,
            9 columns).

    Returns:
        DataFrame with columns chrom, chromStart, chromEnd, tr, biotypes,
        score (comma-joined unique scores), and chromLen.

    Raises:
        ValueError: If any ``biotypes`` entry does not contain exactly one
            ``":"`` separator.
    """
    nr = pd.read_csv(nr_raw_path, sep="\t", header=None)
    nr.columns = [
        "chrom",
        "chromStart",
        "chromEnd",
        "biotypes",
        "score",
        "strand",
        "thickStart",
        "thickEnd",
        "itemRgb",
    ]

    # Every biotypes entry must look like "<tr>:<biotype>" (exactly one colon).
    # NOTE: the previous check went through value_counts().reset_index(),
    # whose column layout changed between pandas 1.x and 2.x (in 1.x the
    # "biotypes" column held frequencies, not the distinct colon counts);
    # counting colons directly is version-stable. Also, `assert` is stripped
    # under `python -O`, so raise explicitly instead.
    colon_counts = nr["biotypes"].str.count(":").unique().tolist()
    if colon_counts != [1]:
        raise ValueError(
            "Expected exactly one ':' in every biotypes entry, "
            f"found colon counts: {colon_counts}"
        )

    # Split "tr:biotype" into two columns (overwrites biotypes with the part
    # after the colon).
    nr[["tr", "biotypes"]] = nr["biotypes"].str.split(":", expand=True)

    logger.info(
        "Keeping only the following columns: chrom, chromStart, chromEnd, biotypes, tr, score."
    )
    nr = nr[["chrom", "chromStart", "chromEnd", "biotypes", "score", "tr"]]

    logger.info(f"Size of database before dropping duplicate rows: {len(nr)}")
    nr = nr.drop_duplicates().reset_index(drop=True)
    logger.info(f"Size of database after dropping duplicate rows: {len(nr)}")

    # Rows identical except for score — the same peak reported with
    # different scores.
    experiment_dups = int(
        nr.duplicated(
            subset=["chrom", "chromStart", "chromEnd", "tr", "biotypes"]
        ).sum()
    )
    logger.info(
        f"{experiment_dups} total rows with same chrom, chromStart, chromEnd, biotypes, tr but different score."
    )

    logger.info(
        "Grouping by everything except score, comma-concatenating unique scores"
    )
    nr = (
        nr.groupby(["chrom", "chromStart", "chromEnd", "tr", "biotypes"])
        .agg({"score": lambda x: ",".join(map(str, sorted(set(x))))})
        .reset_index()
    )

    logger.info(f"Final database size: {len(nr)}")

    # Interval length in bp (BED intervals are half-open: end - start).
    nr["chromLen"] = nr["chromEnd"] - nr["chromStart"]

    return nr
|
|
|
|
def clean_crm(crm_raw_path: Path | str) -> pd.DataFrame:
    """Clean the cis-regulatory modules (CRM) BED file.

    Reads the raw 9-column, headerless, tab-separated BED file, keeps only
    the needed columns, drops duplicate rows, and collapses rows that
    differ only in ``score`` by comma-concatenating their unique scores.

    Args:
        crm_raw_path: Path to the raw CRM BED file (TSV, no header,
            9 columns).

    Returns:
        DataFrame with columns chrom, chromStart, chromEnd, tr,
        score (comma-joined unique scores), and chromLen.
    """
    crm = pd.read_csv(crm_raw_path, sep="\t", header=None)
    crm.columns = [
        "chrom",
        "chromStart",
        "chromEnd",
        "tr",
        "score",
        "strand",
        "thickStart",
        "thickEnd",
        "reserved",
    ]

    logger.info(
        "Keeping only the following columns: chrom, chromStart, chromEnd, tr, score."
    )
    crm = crm[["chrom", "chromStart", "chromEnd", "tr", "score"]]

    logger.info(f"Size of database before dropping duplicate rows: {len(crm)}")
    crm = crm.drop_duplicates().reset_index(drop=True)
    logger.info(f"Size of database after dropping duplicate rows: {len(crm)}")

    # Rows identical except for score — the same module reported with
    # different scores.
    experiment_dups = int(
        crm.duplicated(subset=["chrom", "chromStart", "chromEnd", "tr"]).sum()
    )
    logger.info(
        f"{experiment_dups} total rows with same chrom, chromStart, chromEnd, tr but different score."
    )

    logger.info(
        "Grouping by everything except score, comma-concatenating unique scores"
    )
    crm = (
        crm.groupby(["chrom", "chromStart", "chromEnd", "tr"])
        .agg({"score": lambda x: ",".join(map(str, sorted(set(x))))})
        .reset_index()
    )

    logger.info(f"Final database size: {len(crm)}")

    # Interval length in bp (BED intervals are half-open: end - start).
    crm["chromLen"] = crm["chromEnd"] - crm["chromStart"]

    return crm
|
|
|
|
def _load_if_needed(df: pd.DataFrame | None, savepath: Path) -> pd.DataFrame:
    """Return *df*, reading it back from *savepath* when it is None."""
    return pd.read_csv(savepath, sep="\t") if df is None else df


def _save_sample(df: pd.DataFrame, savepath: Path) -> None:
    """Save a reproducible sample of up to 500 rows of *df* as TSV.

    Uses min(500, len(df)) so small inputs don't raise in DataFrame.sample.
    """
    df.sample(n=min(500, len(df)), random_state=42).reset_index(drop=True).to_csv(
        savepath, sep="\t", index=False
    )


def _clean_and_save(
    raw_path: Path,
    savepath: Path,
    clean_fn,
    label: str,
) -> pd.DataFrame | None:
    """Run *clean_fn* on *raw_path* and save the result unless it exists.

    Returns the cleaned DataFrame, or None when *savepath* already existed
    (cleaning skipped).
    """
    if savepath.exists():
        logger.info(f"File already exists at {savepath}. Skipping")
        return None
    cleaned = clean_fn(raw_path)
    cleaned.to_csv(savepath, sep="\t", index=False)
    logger.info(f"Saved cleaned {label} database to: {savepath}")
    return cleaned


def main(cfg: DictConfig):
    """Clean the raw NR and CRM databases and optionally save example files.

    Expects ``cfg.data_task`` to provide: nr_raw_path, nr_processed_dir,
    nr_processed_filename, crm_raw_path, crm_processed_dir,
    crm_processed_filename, and save_example_files. All paths are resolved
    relative to the project root. Existing outputs are never overwritten.
    """
    nr_raw_path = Path(root) / cfg.data_task.nr_raw_path
    nr_processed_dir = Path(root) / cfg.data_task.nr_processed_dir
    nr_processed_filename = cfg.data_task.nr_processed_filename
    nr_savepath = nr_processed_dir / nr_processed_filename

    crm_raw_path = Path(root) / cfg.data_task.crm_raw_path
    crm_processed_dir = Path(root) / cfg.data_task.crm_processed_dir
    crm_processed_filename = cfg.data_task.crm_processed_filename
    crm_savepath = crm_processed_dir / crm_processed_filename

    nr_processed_dir.mkdir(parents=True, exist_ok=True)
    crm_processed_dir.mkdir(parents=True, exist_ok=True)

    nr_cleaned = _clean_and_save(
        nr_raw_path, nr_savepath, clean_nr, "non-redundant peaks (NR)"
    )
    crm_cleaned = _clean_and_save(
        crm_raw_path, crm_savepath, clean_crm, "cis-regulatory modules (CRM)"
    )

    if cfg.data_task.save_example_files:
        # Small 500-row samples for quick inspection / tests.
        example_nr_dir = nr_processed_dir / "examples"
        example_nr_dir.mkdir(parents=True, exist_ok=True)
        example_nr_savepath = example_nr_dir / ("example500_" + nr_processed_filename)

        if not example_nr_savepath.exists():
            nr_cleaned = _load_if_needed(nr_cleaned, nr_savepath)
            _save_sample(nr_cleaned, example_nr_savepath)
            logger.info(
                f"Saved example NR file with 500 rows to: {example_nr_savepath}"
            )
        else:
            logger.info(
                f"Example file already exists at {example_nr_savepath}. Skipping"
            )

        example_crm_dir = crm_processed_dir / "examples"
        example_crm_dir.mkdir(parents=True, exist_ok=True)
        example_crm_savepath = example_crm_dir / (
            "example500_" + crm_processed_filename
        )
        if not example_crm_savepath.exists():
            crm_cleaned = _load_if_needed(crm_cleaned, crm_savepath)
            _save_sample(crm_cleaned, example_crm_savepath)
            logger.info(
                f"Saved example CRM file with 500 rows to: {example_crm_savepath}"
            )
        else:
            logger.info(
                f"Example file already exists at {example_crm_savepath}. Skipping"
            )

        # Per-TF example: explode the comma-separated TF list into one row
        # per TF, keep only ERG, then sample.
        example_crm_tf_savepath = example_crm_dir / (
            "example500_ERG_" + crm_processed_filename
        )
        if not example_crm_tf_savepath.exists():
            crm_cleaned = _load_if_needed(crm_cleaned, crm_savepath)
            crm_example_tf_db = crm_cleaned.copy(deep=True)
            crm_example_tf_db["tr"] = crm_example_tf_db["tr"].str.split(",")
            crm_example_tf_db = crm_example_tf_db.explode("tr").reset_index(drop=True)
            crm_example_tf_db = crm_example_tf_db.loc[crm_example_tf_db["tr"] == "ERG"]
            _save_sample(crm_example_tf_db, example_crm_tf_savepath)
            logger.info(
                f"Saved example CRM file for one TF with 500 rows to: {example_crm_tf_savepath}"
            )
        else:
            logger.info(
                f"Example file already exists at {example_crm_tf_savepath}. Skipping"
            )
|
|
|
|
if __name__ == "__main__":
    # NOTE(review): main() requires a DictConfig argument but is called with
    # none here, so running this file directly raises TypeError. Presumably
    # a config-loading wrapper (e.g. a hydra.main decorator on main) is
    # intended — confirm how cfg is meant to be supplied.
    main()
|
|