svincoff commited on
Commit
b44075a
·
1 Parent(s): dca7b00
.gitignore CHANGED
@@ -1,3 +1,7 @@
1
  dpacman/data_files
2
  dpacman/data/tfclust/*.log
3
- dpacman/data/tfclust/temp.py
 
 
 
 
 
1
  dpacman/data_files
2
  dpacman/data/tfclust/*.log
3
+ dpacman/data/tfclust/temp.py
4
+ bigBedToBed
5
+ dpacman/data/remap/*.log
6
+ dpacman/data/remap/temp.py
7
+ dpacman/data/tfclust/figures
README.md CHANGED
@@ -13,12 +13,23 @@ license: cc-by-nc-nd-4.0
13
  │   │   ├── chip_atlas
14
  │   │   │   ├── full_data_loading.py
15
  │   │   │   └── smaller_data_loading.py
 
 
16
  │   │   └── tfclust
 
17
  │   │   ├── api_download.py
18
- │   │   ├── download.log
19
  │   │   ├── download.py
 
 
 
 
 
 
 
 
 
20
  │   │   ├── hg38_success_download.log
21
- │   │   └── temp.py
22
  │   └── data_files
23
  │   ├── processed
24
  │   │   └── tfclust
@@ -42,6 +53,11 @@ license: cc-by-nc-nd-4.0
42
  │   │   │   ├── hg19_chr1.json
43
  │   │   └── hg38
44
  │   │   ├── hg38_chr1.json
 
 
 
 
 
45
  │   └── tfclust
46
  │   ├── encRegTfbsClusteredWithCells.hg19.bed
47
  │   ├── encRegTfbsClusteredWithCells.hg38.bed
@@ -56,4 +72,8 @@ license: cc-by-nc-nd-4.0
56
  ```
57
  20 directories, 3089 files
58
 
59
- In `data_files` subfolders, only representative files for certain chromosomes are shown. In reality, any file that contains the substring "_chr" exists for every chromosome in that genome. Genome hg38 has 711 chromosomes. Genome hg19 has 298 chromosomes.
 
 
 
 
 
13
  │   │   ├── chip_atlas
14
  │   │   │   ├── full_data_loading.py
15
  │   │   │   └── smaller_data_loading.py
16
+ │   │   ├── remap
17
+ │   │   │   ├── analyze.py
18
  │   │   └── tfclust
19
+ │   │   ├── analyze.py
20
  │   │   ├── api_download.py
21
+ │   │   ├── combine.py
22
  │   │   ├── download.py
23
+ │   │   ├── figures
24
+ │   │   │   ├── seq_lengths_box.png
25
+ │   │   │   ├── seq_lengths_flanked_box.png
26
+ │   │   │   ├── seq_lengths_flanked_hist.png
27
+ │   │   │   ├── seq_lengths_flanked_xlog_box.png
28
+ │   │   │   ├── seq_lengths_flanked_xlog_hist.png
29
+ │   │   │   ├── seq_lengths_hist.png
30
+ │   │   │   ├── seq_lengths_xlog_box.png
31
+ │   │   │   └── seq_lengths_xlog_hist.png
32
  │   │   ├── hg38_success_download.log
 
33
  │   └── data_files
34
  │   ├── processed
35
  │   │   └── tfclust
 
53
  │   │   │   ├── hg19_chr1.json
54
  │   │   └── hg38
55
  │   │   ├── hg38_chr1.json
56
+ │   ├── remap
57
+ │   │   ├── reMap2022.bb
58
+ │   │   ├── reMap2022.bed
59
+ │   │   ├── remap2022_all_macs2_hg38_v1_0.bed.gz
60
+ │   │   └── remap2022_crm_macs2_hg38_v1_0.bed
61
  │   └── tfclust
62
  │   ├── encRegTfbsClusteredWithCells.hg19.bed
63
  │   ├── encRegTfbsClusteredWithCells.hg38.bed
 
72
  ```
73
  20 directories, 3089 files
74
 
75
+ In `data_files` subfolders, only representative files for certain chromosomes are shown. In reality, any file that contains the substring "_chr" exists for every chromosome in that genome. Genome hg38 has 711 chromosomes. Genome hg19 has 298 chromosomes. To reconstruct a full directory structure, run the following from `DPACMAN`:
76
+
77
+ ```
78
+ tree -I '__pycache__|*.egg-info|*.git' > tree.txt
79
+ ```
dpacman/data/README.md CHANGED
@@ -15,4 +15,27 @@ gunzip encRegTfbsClusteredWithCells.hg38.bed.gz
15
  ```
16
  wget https://hgdownload.soe.ucsc.edu/goldenPath/hg19/encRegTfbsClustered/encRegTfbsClusteredWithCells.hg19.bed.gz
17
  gunzip encRegTfbsClusteredWithCells.hg19.bed.gz
18
- ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  ```
16
  wget https://hgdownload.soe.ucsc.edu/goldenPath/hg19/encRegTfbsClustered/encRegTfbsClusteredWithCells.hg19.bed.gz
17
  gunzip encRegTfbsClusteredWithCells.hg19.bed.gz
18
+ ```
19
+
20
+ 3. ReMap big bed file
21
+ ```
22
+ wget https://hgdownload.soe.ucsc.edu/gbdb/hg38/reMap/reMap2022.bb
23
+ wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/bigBedToBed
24
+ chmod +x bigBedToBed
25
+ ./bigBedToBed dpacman/data_files/raw/remap/reMap2022.bb dpacman/data_files/raw/remap/reMap2022.bed
26
+
27
+ ```
28
+
29
+ 4. ReMap CRM file from their actual website
30
+ ```
31
+ wget https://remap.univ-amu.fr/storage/remap2022/hg38/MACS2/remap2022_crm_macs2_hg38_v1_0.bed.gz
32
+ gunzip remap2022_crm_macs2_hg38_v1_0.bed.gz
33
+ ```
34
+
35
+ 5. Run `download.py` to download:
36
+ - Full sequences of each chromosome for genomes hg38 and hg19
37
+ - encRegTfbsClusteredWithCells, a table of clustered transcription factors by their binding sites, for hg38 and hg19
38
+ - processed databases per genome per chromosome with the following columns: "bin","chrom","chromStart","chromEnd","name","score","scoreCount","sourceIds","sourceScores","seq","seq_flanked","chromStart_flanked","chromEnd_flanked","flank5","flank3"
39
+
40
+ ### Data Processing
41
+ 1. Run `combine.py` to combine these individual files into one large DataFrame
dpacman/data/remap/analyze.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import pandas as pd
3
+
4
def main(logger=None):
    """Inspect the ReMap 2022 BED files and dump small example CSVs.

    Loads the full peak BED and the CRM (cis-regulatory module) BED,
    prints row counts and interval-length statistics, writes the first
    50 CRM rows (and the first 50 ERG rows) to CSV, and explodes the
    comma-separated CRM ``name`` column into one row per TF.

    Args:
        logger: optional ``logging.Logger``; defaults to a module logger.
    """
    if logger is None:
        logger = logging.getLogger(__name__)

    # Full ReMap peak file: 11 unnamed tab-separated columns.
    bed_file_path = "../../data_files/raw/remap/reMap2022.bed"
    logger.info("Loading %s", bed_file_path)  # fix: logger was accepted but never used
    df = pd.read_csv(bed_file_path, sep="\t", header=None)
    df.columns = ["#chrom", "chromStart", "chromEnd", "name", "score", "strand", "thickStart", "thickEnd", "reserved", "TF", "Biotypes"]
    print(f"{len(df):,}")
    print(df.head(50))

    # CRM file: same BED layout without the TF/Biotypes columns.
    crm_bed_file_path = "../../data_files/raw/remap/remap2022_crm_macs2_hg38_v1_0.bed"
    logger.info("Loading %s", crm_bed_file_path)
    crm = pd.read_csv(crm_bed_file_path, sep="\t", header=None)
    crm.columns = ["#chrom", "chromStart", "chromEnd", "name", "score", "strand", "thickStart", "thickEnd", "reserved"]
    crm["chromLen"] = crm["chromEnd"] - crm["chromStart"]
    crm["thickLen"] = crm["thickEnd"] - crm["thickStart"]
    print(f"{len(crm):,}")
    print("thick length statistics:")  # fix: was an f-string with no placeholders
    print(crm["thickLen"].describe())
    print("chrom length statistics:")  # fix: was an f-string with no placeholders
    print(crm["chromLen"].describe())
    print(crm[["#chrom", "chromStart", "chromEnd", "name", "score", "strand", "thickStart", "thickEnd", "reserved"]].head(50))
    crm.head(50).to_csv("crm_example.csv",index=False)

    # "name" holds comma-separated TF lists; give each TF its own row.
    crm["name"] = crm["name"].apply(lambda x: x.split(","))
    crm = crm.explode("name").reset_index(drop=True)
    crm.loc[crm["name"]=="ERG"].reset_index(drop=True).head(50).to_csv("crm_example_ERG.csv",index=False)
32
+ if __name__ == "__main__":
33
+ log_path = "analyze.log"
34
+
35
+ logger = logging.getLogger(__name__)
36
+ logger.setLevel(logging.DEBUG)
37
+
38
+ # Create file handler
39
+ file_handler = logging.FileHandler(log_path, mode="w", encoding="utf-8")
40
+ formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
41
+ file_handler.setFormatter(formatter)
42
+
43
+ # Attach handlers
44
+ logger.addHandler(file_handler)
45
+
46
+ main(logger)
dpacman/data/tfclust/analyze.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import logging
3
+ import os
4
+ import dask.dataframe as dd
5
+ import matplotlib.pyplot as plt
6
+
7
def plot_sequence_lengths_box(lengths, xlog=False, title="Sequence Lengths", out_dir="figures", fname="sequence_lengths_box.png"):
    """Save a horizontal box plot of sequence lengths.

    Can be used with original sequence or flank sequence lengths.

    Args:
        lengths: iterable of sequence lengths.
        xlog: if True, use a logarithmic x-axis.
        title: plot title.
        out_dir: output directory (created if missing).
        fname: output file name inside ``out_dir``.
    """
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, fname)

    plt.figure(figsize=(10, 4))
    plt.boxplot(lengths, vert=False)
    if xlog:
        plt.xscale('log')
    plt.xlabel("Sequence Length")
    plt.title(title)
    plt.grid(True, axis='y', linestyle='--', alpha=0.6)
    plt.tight_layout()

    plt.savefig(out_path, dpi=300)
    # fix: release the figure. This function is called repeatedly per run;
    # without close() every figure stays open and leaks memory.
    plt.close()
24
+
25
def plot_sequence_lengths_hist(lengths, xlog=False, title="Sequence Lengths", out_dir="figures", fname="sequence_lengths_hist.png"):
    """Save a density histogram of sequence lengths with percent y-labels.

    Can be used with original sequence or flank sequence lengths.

    Args:
        lengths: iterable of sequence lengths.
        xlog: if True, use a logarithmic x-axis.
        title: plot title.
        out_dir: output directory (created if missing).
        fname: output file name inside ``out_dir``.
    """
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, fname)

    plt.figure(figsize=(10, 4))
    plt.hist(lengths, bins=100, density=True, alpha=0.75)
    if xlog:
        plt.xscale('log')
    # fix: '{:.2%}' already multiplies by 100; the previous 100*y inflated
    # every tick label by a factor of 100.
    # NOTE(review): with density=True the y values are densities, not
    # fractions of samples — percent labels are exact only for unit-width
    # bins. Confirm this is the intended reading of the axis.
    plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: '{:.2%}'.format(y)))
    plt.xlabel("Sequence Length")
    plt.ylabel("Frequency")
    plt.title(title)
    plt.grid(True, axis='y', linestyle='--', alpha=0.6)
    plt.tight_layout()

    plt.savefig(out_path, dpi=300)
    # fix: release the figure to avoid accumulating open figures across calls.
    plt.close()
45
+
46
def main(logger):
    """Load the combined TF-cluster table and render length plots.

    Reads the merged hg38+hg19 parquet produced by ``combine.py`` and
    writes histogram and box plots of the raw and flanked sequence
    lengths, each in linear-x and log-x variants.
    """
    df_dir = "../../data_files/processed/tfclust/combined"
    df_savepath = os.path.join(df_dir, "encRegTfbsClustered_hg38_hg19.parquet")
    logger.info("Starting to load data file from parquet")
    df = pd.read_parquet(df_savepath, engine="auto")
    logger.info(df.head())

    # (column, plot title, filename stem) for the two length distributions.
    series_specs = (
        ("seq_len", "TF Binding Sites", "seq_lengths"),
        ("seq_flanked_len", "TF Binding Sites with 1000nt Flanks", "seq_lengths_flanked"),
    )
    # Emit hists before boxes within each axis scale, matching the
    # original one-call-per-line ordering.
    for use_xlog in (False, True):
        tag = "_xlog" if use_xlog else ""
        for plotter, kind in (
            (plot_sequence_lengths_hist, "hist"),
            (plot_sequence_lengths_box, "box"),
        ):
            for column, plot_title, stem in series_specs:
                plotter(df[column], xlog=use_xlog, title=plot_title, fname=f"{stem}{tag}_{kind}.png")
62
+
63
if __name__ == "__main__":
    # Script entry point: log everything to analyze.log, overwriting
    # any output from a previous run.
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)

    handler = logging.FileHandler("analyze.log", mode="w", encoding="utf-8")
    handler.setFormatter(
        logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    )
    logger.addHandler(handler)

    main(logger)
dpacman/data/tfclust/combine.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import logging
3
+ import os
4
+ import dask.dataframe as dd
5
+
6
def main():
    """Combine per-chromosome tfclust CSVs into one parquet dataset.

    Reads every processed hg38/hg19 CSV lazily with Dask, tags each row
    with its genome, concatenates the two genomes, adds sequence-length
    columns, and writes the result as parquet.
    """
    hg38_dir = "../../data_files/processed/tfclust/hg38"
    hg19_dir = "../../data_files/processed/tfclust/hg19"

    # Collect per-chromosome CSVs; subdirectories such as logs/ are skipped.
    hg38_files = [os.path.join(hg38_dir,x) for x in os.listdir(hg38_dir) if os.path.isfile(os.path.join(hg38_dir,x))]
    hg19_files = [os.path.join(hg19_dir,x) for x in os.listdir(hg19_dir) if os.path.isfile(os.path.join(hg19_dir,x))]

    logging.info(f"Total hg38 files: {len(hg38_files)}")
    logging.info(f"Total hg19 files: {len(hg19_files)}")

    # Row counts recorded by the download step for each genome.
    hg38_complete = pd.read_csv(os.path.join(hg38_dir,"logs/completed.txt"), sep="\t")
    hg19_complete = pd.read_csv(os.path.join(hg19_dir,"logs/completed.txt"), sep="\t")

    hg38_rows = sum(hg38_complete['row_count'])
    hg19_rows = sum(hg19_complete['row_count'])
    logging.info(f"Total hg38 rows: {hg38_rows:,}")
    logging.info(f"Total hg19 rows: {hg19_rows:,}")
    logging.info(f"Total: {hg38_rows + hg19_rows:,}")

    # Read each genome lazily as a Dask DataFrame and tag its origin
    # (assign is the Dask-safe way to add a constant column).
    full_df_hg38 = dd.read_csv(hg38_files).assign(genome="hg38")
    full_df_hg19 = dd.read_csv(hg19_files).assign(genome="hg19")

    # Concatenate both genomes into one lazy DataFrame.
    full_df = dd.concat([full_df_hg38, full_df_hg19])

    # fix: log the precomputed row total instead of len(full_df) — len() on
    # a Dask DataFrame forces a full pass over every CSV, and the data would
    # then be read a second time by to_parquet below.
    # (also fixes the "ccombined" typo in the message)
    logging.info(f"Added all files to combined DataFrame. Total rows: {hg38_rows + hg19_rows}")

    full_df["seq_len"] = full_df["seq"].str.len()
    full_df["seq_flanked_len"] = full_df["seq_flanked"].str.len()
    logging.info("Added sequence length column.")  # fix: was a placeholder-free f-string

    full_df_dir = "../../data_files/processed/tfclust/combined"
    full_df_savepath = os.path.join(full_df_dir, "encRegTfbsClustered_hg38_hg19.parquet")
    os.makedirs(full_df_dir, exist_ok=True)
    full_df.to_parquet(full_df_savepath)  # parquet: much faster and more compact than CSV
    logging.info(f"Saved combined DataFrame to {full_df_savepath}.")
49
+
50
+
51
if __name__ == "__main__":
    # Script entry point: capture DEBUG-and-above to combine.log,
    # truncating the file on each run.
    logging.basicConfig(
        filename="combine.log",
        filemode="w",
        encoding="utf-8",
        level=logging.DEBUG,
    )
    main()
dpacman/data/tfclust/download.py CHANGED
@@ -170,7 +170,7 @@ def get_sequence(
170
  results_dict = {
171
  "chromStart": new_start,
172
  "chromEnd": new_end,
173
- "seq": chrom_seq[new_start:new_end+1]
174
  }
175
  return results_dict
176
 
 
170
  results_dict = {
171
  "chromStart": new_start,
172
  "chromEnd": new_end,
173
+ "seq": chrom_seq[new_start:new_end]# end is NOT inclusive!!
174
  }
175
  return results_dict
176
 
environment.yaml CHANGED
@@ -22,7 +22,7 @@ channels:
22
 
23
  dependencies:
24
  - python=3.10
25
-
26
  - pip>=23
27
  - pip:
28
  - rootutils
 
22
 
23
  dependencies:
24
  - python=3.10
25
+ - dask[complete]
26
  - pip>=23
27
  - pip:
28
  - rootutils