recent changes

Browse files

Files changed (4) hide show

.gitignore +5 -1
dpacman/data/consistency.py +10 -0
dpacman/data/remap/analyze.py +2 -0
dpacman/data/tfclust/combine.py +64 -4

.gitignore CHANGED Viewed

@@ -4,4 +4,8 @@ dpacman/data/tfclust/temp.py
 bigBedToBed
 dpacman/data/remap/*.log
 dpacman/data/remap/temp.py
-dpacman/data/tfclust/figures

 bigBedToBed
 dpacman/data/remap/*.log
 dpacman/data/remap/temp.py
+dpacman/data/tfclust/figures
+dpacman/softwares
+dpacman/data/remap/crm_example.csv
+dpacman/data/remap/crm_example_ERG.csv
+tree.txt

dpacman/data/consistency.py ADDED Viewed

	@@ -0,0 +1,10 @@

+"""
+Check for consistency between Remap and Tfclust data
+"""
+import logging
+import pandas as pd
+import logging
+import os
+import dask.dataframe as dd

dpacman/data/remap/analyze.py CHANGED Viewed

@@ -10,6 +10,8 @@ def main(logger=None):
     df = pd.read_csv(bed_file_path, sep="\t", header=None)
     df.columns = ["#chrom", "chromStart", "chromEnd", "name", "score", "strand", "thickStart", "thickEnd", "reserved", "TF", "Biotypes"]
     print(f"{len(df):,}")
     print(df.head(50))
     crm_bed_file_path = "../../data_files/raw/remap/remap2022_crm_macs2_hg38_v1_0.bed"

     df = pd.read_csv(bed_file_path, sep="\t", header=None)
     df.columns = ["#chrom", "chromStart", "chromEnd", "name", "score", "strand", "thickStart", "thickEnd", "reserved", "TF", "Biotypes"]
     print(f"{len(df):,}")
+    crm["chromLen"] = crm["chromEnd"] - crm["chromStart"]
+    print(crm["chromLen"].describe())
     print(df.head(50))
     crm_bed_file_path = "../../data_files/raw/remap/remap2022_crm_macs2_hg38_v1_0.bed"

dpacman/data/tfclust/combine.py CHANGED Viewed

@@ -3,10 +3,7 @@ import logging
 import os
 import dask.dataframe as dd
-def main():
-    hg38_dir = "../../data_files/processed/tfclust/hg38"
-    hg19_dir = "../../data_files/processed/tfclust/hg19"
     # See how many files there are
     hg38_files = [os.path.join(hg38_dir,x) for x in os.listdir(hg38_dir) if os.path.isfile(os.path.join(hg38_dir,x))]
     hg19_files = [os.path.join(hg19_dir,x) for x in os.listdir(hg19_dir) if os.path.isfile(os.path.join(hg19_dir,x))]
@@ -46,7 +43,70 @@ def main():
     os.makedirs(full_df_dir, exist_ok=True)
     full_df.to_parquet(full_df_savepath)  # much faster and more compact
     logging.info(f"Saved combined DataFrame to {full_df_savepath}.")
 if __name__ == "__main__":
     logging.basicConfig(filename="combine.log", encoding="utf-8", level=logging.DEBUG, filemode="w")

 import os
 import dask.dataframe as dd
+def combine_hg38_hg19(hg38_dir, hg19_dir):
     # See how many files there are
     hg38_files = [os.path.join(hg38_dir,x) for x in os.listdir(hg38_dir) if os.path.isfile(os.path.join(hg38_dir,x))]
     hg19_files = [os.path.join(hg19_dir,x) for x in os.listdir(hg19_dir) if os.path.isfile(os.path.join(hg19_dir,x))]
     os.makedirs(full_df_dir, exist_ok=True)
     full_df.to_parquet(full_df_savepath)  # much faster and more compact
     logging.info(f"Saved combined DataFrame to {full_df_savepath}.")
+# Define the aggregation function
+def collapse_group(group):
+    """
+    Collapse every row of one group into a single row of comma-joined strings.
+    group: the rows of one groupby group; must contain the columns listed below.
+    Returns a pd.Series with one comma-joined string per column, in this order:
+    name, score, bin, scoreCount, sourceIds, sourceScores
+    """
+    columns = ("name", "score", "bin", "scoreCount", "sourceIds", "sourceScores")
+    # Every column (including name) goes through str() so a non-string value
+    # cannot make ",".join raise a TypeError.
+    return pd.Series({col: ",".join(map(str, group[col])) for col in columns})
+def reorg_like_remap(genome_dir, fname):
+    """
+    Reorganize a chromosome from tfclust processing towards the layout of remap files:
+    #chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,reserved,chromLen,thickLen
+    Original format of my processed tfclust files
+    bin,chrom,chromStart,chromEnd,name,score,scoreCount,sourceIds,sourceScores,seq,seq_flanked,chromStart_flanked,chromEnd_flanked,flank5,flank3
+
+    Rows sharing (#chrom, chromStart, chromEnd) are merged with collapse_group, so
+    name, score, bin, scoreCount, sourceIds and sourceScores each become one
+    comma-joined string per region.
+    NOTE(review): strand, thickStart, thickEnd, reserved, chromLen and thickLen are
+    not produced here; only the columns selected below are written.
+
+    genome_dir: directory holding the processed tfclust CSV files.
+    fname: name of the CSV file inside genome_dir.
+    The result is saved to <genome_dir>/remap_reorg/<fname with ".csv" -> "_reorg.csv">.
+    """
+    fpath = os.path.join(genome_dir, fname)
+    df = dd.read_csv(fpath)
+    # Show the head
+    print(df.head())
+    # Keep everything except the sequences
+    df = df[[
+        "chrom", "chromStart", "chromEnd", "name", "score", # same as other file
+        "bin","scoreCount","sourceIds","sourceScores"
+    ]].rename(columns={"chrom":"#chrom"})
+    # Apply groupby with known output types (meta)
+    meta = {
+        "name": str,
+        "score": str,
+        "bin": str,
+        "scoreCount": str,
+        "sourceIds": str,
+        "sourceScores": str
+    }
+    grouped = df.groupby(["#chrom", "chromStart", "chromEnd"]).apply(collapse_group, meta=meta)
+    # You can now compute it
+    result = grouped.compute()
+    # save the result
+    out_dir = os.path.join(genome_dir, "remap_reorg")
+    os.makedirs(out_dir, exist_ok=True)
+    savepath = os.path.join(out_dir, fname.replace(".csv", "_reorg.csv"))
+    # Write to the file path that is logged below, not to the genome directory.
+    result.to_csv(savepath, index=True)
+    logging.info(f"Saved reorganized file to {savepath}")
+def main():
+    """
+    Run reorg_like_remap on the processed hg38 tfclust files (currently chr1 only).
+    """
+    hg38_dir = "../../data_files/processed/tfclust/hg38"
+    hg19_dir = "../../data_files/processed/tfclust/hg19"
+    #combine_hg38_hg19(hg38_dir, hg19_dir)
+    chromosomes = ["chr1"]
+    for chrom_name in chromosomes:
+        csv_name = f"encRegTfbsClustered_hg38_{chrom_name}.csv"
+        reorg_like_remap(hg38_dir, csv_name)
 if __name__ == "__main__":
     logging.basicConfig(filename="combine.log", encoding="utf-8", level=logging.DEBUG, filemode="w")