svincoff committed on
Commit
1630441
·
1 Parent(s): b44075a

recent changes

Browse files
.gitignore CHANGED
@@ -4,4 +4,8 @@ dpacman/data/tfclust/temp.py
4
  bigBedToBed
5
  dpacman/data/remap/*.log
6
  dpacman/data/remap/temp.py
7
- dpacman/data/tfclust/figures
 
 
 
 
 
4
  bigBedToBed
5
  dpacman/data/remap/*.log
6
  dpacman/data/remap/temp.py
7
+ dpacman/data/tfclust/figures
8
+ dpacman/softwares
9
+ dpacman/data/remap/crm_example.csv
10
+ dpacman/data/remap/crm_example_ERG.csv
11
+ tree.txt
dpacman/data/consistency.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Check for consistency between Remap and Tfclust data
3
+ """
4
+
5
+ import logging
6
+ import pandas as pd
7
+ import logging
8
+ import os
9
+ import dask.dataframe as dd
10
+
dpacman/data/remap/analyze.py CHANGED
@@ -10,6 +10,8 @@ def main(logger=None):
10
  df = pd.read_csv(bed_file_path, sep="\t", header=None)
11
  df.columns = ["#chrom", "chromStart", "chromEnd", "name", "score", "strand", "thickStart", "thickEnd", "reserved", "TF", "Biotypes"]
12
  print(f"{len(df):,}")
 
 
13
  print(df.head(50))
14
 
15
  crm_bed_file_path = "../../data_files/raw/remap/remap2022_crm_macs2_hg38_v1_0.bed"
 
10
  df = pd.read_csv(bed_file_path, sep="\t", header=None)
11
  df.columns = ["#chrom", "chromStart", "chromEnd", "name", "score", "strand", "thickStart", "thickEnd", "reserved", "TF", "Biotypes"]
12
  print(f"{len(df):,}")
13
+ crm["chromLen"] = crm["chromEnd"] - crm["chromStart"]
14
+ print(crm["chromLen"].describe())
15
  print(df.head(50))
16
 
17
  crm_bed_file_path = "../../data_files/raw/remap/remap2022_crm_macs2_hg38_v1_0.bed"
dpacman/data/tfclust/combine.py CHANGED
@@ -3,10 +3,7 @@ import logging
3
  import os
4
  import dask.dataframe as dd
5
 
6
- def main():
7
- hg38_dir = "../../data_files/processed/tfclust/hg38"
8
- hg19_dir = "../../data_files/processed/tfclust/hg19"
9
-
10
  # See how many files there are
11
  hg38_files = [os.path.join(hg38_dir,x) for x in os.listdir(hg38_dir) if os.path.isfile(os.path.join(hg38_dir,x))]
12
  hg19_files = [os.path.join(hg19_dir,x) for x in os.listdir(hg19_dir) if os.path.isfile(os.path.join(hg19_dir,x))]
@@ -46,7 +43,70 @@ def main():
46
  os.makedirs(full_df_dir, exist_ok=True)
47
  full_df.to_parquet(full_df_savepath) # much faster and more compact
48
  logging.info(f"Saved combined DataFrame to {full_df_savepath}.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
 
 
50
 
51
  if __name__ == "__main__":
52
  logging.basicConfig(filename="combine.log", encoding="utf-8", level=logging.DEBUG, filemode="w")
 
3
  import os
4
  import dask.dataframe as dd
5
 
6
+ def combine_hg38_hg19(hg38_dir, hg19_dir):
 
 
 
7
  # See how many files there are
8
  hg38_files = [os.path.join(hg38_dir,x) for x in os.listdir(hg38_dir) if os.path.isfile(os.path.join(hg38_dir,x))]
9
  hg19_files = [os.path.join(hg19_dir,x) for x in os.listdir(hg19_dir) if os.path.isfile(os.path.join(hg19_dir,x))]
 
43
  os.makedirs(full_df_dir, exist_ok=True)
44
  full_df.to_parquet(full_df_savepath) # much faster and more compact
45
  logging.info(f"Saved combined DataFrame to {full_df_savepath}.")
46
+
47
+
48
+ # Define the aggregation function
49
def collapse_group(group):
    """
    Collapse all rows of one group into a single row of comma-joined strings.

    Args:
        group: DataFrame-like object holding the rows of one group. It must
            provide the columns "name", "score", "bin", "scoreCount",
            "sourceIds" and "sourceScores".

    Returns:
        A pandas Series with those six keys, in that order. Each value is the
        comma-separated concatenation of the group's values for that column,
        in row order.
    """
    columns = ("name", "score", "bin", "scoreCount", "sourceIds", "sourceScores")
    # Every column, including "name", goes through str() so that a non-string
    # entry (e.g. a missing or numeric name) cannot make ",".join raise TypeError.
    return pd.Series({
        column: ",".join(map(str, group[column]))
        for column in columns
    })
58
+
59
def reorg_like_remap(genome_dir, fname):
    """
    Reorganize a chromosome from tfclust processing to be in the format of remap files:
    #chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,reserved,chromLen,thickLen

    Original format of my processed tfclust files
    bin,chrom,chromStart,chromEnd,name,score,scoreCount,sourceIds,sourceScores,seq,seq_flanked,chromStart_flanked,chromEnd_flanked,flank5,flank3

    Rows are grouped by (#chrom, chromStart, chromEnd) and the remaining kept
    columns are collapsed into comma-joined strings by ``collapse_group``.
    Only the grouping key and those collapsed columns are written; the other
    remap columns listed above are not produced by this function.

    Args:
        genome_dir: Directory containing the processed tfclust CSV files.
        fname: File name of the CSV to reorganize, relative to ``genome_dir``.

    Returns:
        None. The result is written to
        ``<genome_dir>/remap_reorg/<fname with ".csv" replaced by "_reorg.csv">``.
    """

    fpath = os.path.join(genome_dir, fname)
    df = dd.read_csv(fpath)

    # Show the head
    print(df.head())

    # Keep everything except the sequences
    df = df[[
        "chrom", "chromStart", "chromEnd", "name", "score",  # same as other file
        "bin", "scoreCount", "sourceIds", "sourceScores",
    ]].rename(columns={"chrom": "#chrom"})

    # Apply groupby with known output types (meta)
    meta = {
        "name": str,
        "score": str,
        "bin": str,
        "scoreCount": str,
        "sourceIds": str,
        "sourceScores": str,
    }

    grouped = df.groupby(["#chrom", "chromStart", "chromEnd"]).apply(collapse_group, meta=meta)

    # Materialize the lazy dask result
    result = grouped.compute()

    # Save the result
    out_dir = os.path.join(genome_dir, "remap_reorg")
    os.makedirs(out_dir, exist_ok=True)
    savepath = os.path.join(out_dir, fname.replace(".csv", "_reorg.csv"))
    # Write to the computed file path. Passing genome_dir here would target a
    # directory rather than the file named in the log message below.
    result.to_csv(savepath, index=True)
    logging.info(f"Saved reorganized file to {savepath}")
101
+
102
def main():
    """Reorganize the selected hg38 tfclust chromosome files into the remap-like layout."""
    hg38_dir = "../../data_files/processed/tfclust/hg38"
    hg19_dir = "../../data_files/processed/tfclust/hg19"

    # The hg38/hg19 combination step is currently switched off:
    # combine_hg38_hg19(hg38_dir, hg19_dir)

    chromosomes = ["chr1"]
    for chrom in chromosomes:
        chrom_fname = f"encRegTfbsClustered_hg38_{chrom}.csv"
        reorg_like_remap(hg38_dir, chrom_fname)
110
 
111
  if __name__ == "__main__":
112
  logging.basicConfig(filename="combine.log", encoding="utf-8", level=logging.DEBUG, filemode="w")