svincoff commited on
Commit
b44075a
·
1 Parent(s): dca7b00
.gitignore CHANGED
@@ -1,3 +1,7 @@
1
  dpacman/data_files
2
  dpacman/data/tfclust/*.log
3
- dpacman/data/tfclust/temp.py
 
 
 
 
 
1
  dpacman/data_files
2
  dpacman/data/tfclust/*.log
3
+ dpacman/data/tfclust/temp.py
4
+ bigBedToBed
5
+ dpacman/data/remap/*.log
6
+ dpacman/data/remap/temp.py
7
+ dpacman/data/tfclust/figures
README.md CHANGED
@@ -13,12 +13,23 @@ license: cc-by-nc-nd-4.0
13
  │   │   ├── chip_atlas
14
  │   │   │   ├── full_data_loading.py
15
  │   │   │   └── smaller_data_loading.py
 
 
16
  │   │   └── tfclust
 
17
  │   │   ├── api_download.py
18
- │   │   ├── download.log
19
  │   │   ├── download.py
 
 
 
 
 
 
 
 
 
20
  │   │   ├── hg38_success_download.log
21
- │   │   └── temp.py
22
  │   └── data_files
23
  │   ├── processed
24
  │   │   └── tfclust
@@ -42,6 +53,11 @@ license: cc-by-nc-nd-4.0
42
  │   │   │   ├── hg19_chr1.json
43
  │   │   └── hg38
44
  │   │   ├── hg38_chr1.json
 
 
 
 
 
45
  │   └── tfclust
46
  │   ├── encRegTfbsClusteredWithCells.hg19.bed
47
  │   ├── encRegTfbsClusteredWithCells.hg38.bed
@@ -56,4 +72,8 @@ license: cc-by-nc-nd-4.0
56
  ```
57
  20 directories, 3089 files
58
 
59
- In `data_files` subfolders, only representative files for certain chromosomes are shown. In reality, any file that contains the substring "_chr" exists for every chromosome in that genome. Genome hg38 has 711 chromosomes. Genome hg19 has 298 chromosomes.
 
 
 
 
 
13
  │   │   ├── chip_atlas
14
  │   │   │   ├── full_data_loading.py
15
  │   │   │   └── smaller_data_loading.py
16
+ │   │   ├── remap
17
+ │   │   │   ├── analyze.py
18
  │   │   └── tfclust
19
+ │   │   ├── analyze.py
20
  │   │   ├── api_download.py
21
+ │   │   ├── combine.py
22
  │   │   ├── download.py
23
+ │   │   ├── figures
24
+ │   │   │   ├── seq_lengths_box.png
25
+ │   │   │   ├── seq_lengths_flanked_box.png
26
+ │   │   │   ├── seq_lengths_flanked_hist.png
27
+ │   │   │   ├── seq_lengths_flanked_xlog_box.png
28
+ │   │   │   ├── seq_lengths_flanked_xlog_hist.png
29
+ │   │   │   ├── seq_lengths_hist.png
30
+ │   │   │   ├── seq_lengths_xlog_box.png
31
+ │   │   │   └── seq_lengths_xlog_hist.png
32
  │   │   ├── hg38_success_download.log
 
33
  │   └── data_files
34
  │   ├── processed
35
  │   │   └── tfclust
 
53
  │   │   │   ├── hg19_chr1.json
54
  │   │   └── hg38
55
  │   │   ├── hg38_chr1.json
56
+ │   ├── remap
57
+ │   │   ├── reMap2022.bb
58
+ │   │   ├── reMap2022.bed
59
+ │   │   ├── remap2022_all_macs2_hg38_v1_0.bed.gz
60
+ │   │   └── remap2022_crm_macs2_hg38_v1_0.bed
61
  │   └── tfclust
62
  │   ├── encRegTfbsClusteredWithCells.hg19.bed
63
  │   ├── encRegTfbsClusteredWithCells.hg38.bed
 
72
  ```
73
  20 directories, 3089 files
74
 
75
+ In `data_files` subfolders, only representative files for certain chromosomes are shown. In reality, any file that contains the substring "_chr" exists for every chromosome in that genome. Genome hg38 has 711 chromosomes. Genome hg19 has 298 chromosomes. To reconstruct a full directory structure, run the following from `DPACMAN`:
76
+
77
+ ```
78
+ tree -I '__pycache__|*.egg-info|*.git' > tree.txt
79
+ ```
dpacman/data/README.md CHANGED
@@ -15,4 +15,27 @@ gunzip encRegTfbsClusteredWithCells.hg38.bed.gz
15
  ```
16
  wget https://hgdownload.soe.ucsc.edu/goldenPath/hg19/encRegTfbsClustered/encRegTfbsClusteredWithCells.hg19.bed.gz
17
  gunzip encRegTfbsClusteredWithCells.hg19.bed.gz
18
- ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  ```
16
  wget https://hgdownload.soe.ucsc.edu/goldenPath/hg19/encRegTfbsClustered/encRegTfbsClusteredWithCells.hg19.bed.gz
17
  gunzip encRegTfbsClusteredWithCells.hg19.bed.gz
18
+ ```
19
+
20
+ 3. ReMap big bed file
21
+ ```
22
+ wget https://hgdownload.soe.ucsc.edu/gbdb/hg38/reMap/reMap2022.bb
23
+ wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/bigBedToBed
24
+ chmod +x bigBedToBed
25
+ ./bigBedToBed dpacman/data_files/raw/remap/reMap2022.bb dpacman/data_files/raw/remap/reMap2022.bed
26
+
27
+ ```
28
+
29
+ 4. ReMap CRM file from their actual website
30
+ ```
31
+ wget https://remap.univ-amu.fr/storage/remap2022/hg38/MACS2/remap2022_crm_macs2_hg38_v1_0.bed.gz
32
+ gunzip remap2022_crm_macs2_hg38_v1_0.bed.gz
33
+ ```
34
+
35
+ 5. Run `download.py` to download:
36
+ - Full sequences of each chromosome for genomes hg38 and hg19
37
+ - encRegTfbsClusteredWithCells, a table of clustered transcription factors by their binding sites, for hg38 and hg19
38
+ - processed databases per genome per chromosome with the following columns: "bin","chrom","chromStart","chromEnd","name","score","scoreCount","sourceIds","sourceScores","seq","seq_flanked","chromStart_flanked","chromEnd_flanked","flank5","flank3"
39
+
40
+ ### Data Processing
41
+ 1. Run `combine.py` to combine these individual files into one large DataFrame
dpacman/data/remap/analyze.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import pandas as pd
3
+
4
def main(logger=None):
    """Inspect the ReMap 2022 BED files and dump small example CSVs.

    Loads the full peak BED and the CRM (cis-regulatory module) BED,
    prints row counts and interval-length statistics, writes the first
    50 CRM rows (and the first 50 ERG rows) to CSV, and explodes the
    comma-separated CRM ``name`` column into one row per TF.

    Args:
        logger: optional ``logging.Logger``; defaults to a module logger.
    """
    if logger is None:
        logger = logging.getLogger(__name__)

    # Full ReMap peak file: 11 unnamed tab-separated columns.
    bed_file_path = "../../data_files/raw/remap/reMap2022.bed"
    logger.info("Loading %s", bed_file_path)  # fix: logger was accepted but never used
    df = pd.read_csv(bed_file_path, sep="\t", header=None)
    df.columns = ["#chrom", "chromStart", "chromEnd", "name", "score", "strand", "thickStart", "thickEnd", "reserved", "TF", "Biotypes"]
    print(f"{len(df):,}")
    print(df.head(50))

    # CRM file: same BED layout without the TF/Biotypes columns.
    crm_bed_file_path = "../../data_files/raw/remap/remap2022_crm_macs2_hg38_v1_0.bed"
    logger.info("Loading %s", crm_bed_file_path)
    crm = pd.read_csv(crm_bed_file_path, sep="\t", header=None)
    crm.columns = ["#chrom", "chromStart", "chromEnd", "name", "score", "strand", "thickStart", "thickEnd", "reserved"]
    crm["chromLen"] = crm["chromEnd"] - crm["chromStart"]
    crm["thickLen"] = crm["thickEnd"] - crm["thickStart"]
    print(f"{len(crm):,}")
    print("thick length statistics:")  # fix: was an f-string with no placeholders
    print(crm["thickLen"].describe())
    print("chrom length statistics:")  # fix: was an f-string with no placeholders
    print(crm["chromLen"].describe())
    print(crm[["#chrom", "chromStart", "chromEnd", "name", "score", "strand", "thickStart", "thickEnd", "reserved"]].head(50))
    crm.head(50).to_csv("crm_example.csv",index=False)

    # "name" holds comma-separated TF lists; give each TF its own row.
    crm["name"] = crm["name"].apply(lambda x: x.split(","))
    crm = crm.explode("name").reset_index(drop=True)
    crm.loc[crm["name"]=="ERG"].reset_index(drop=True).head(50).to_csv("crm_example_ERG.csv",index=False)
32
+ if __name__ == "__main__":
33
+ log_path = "analyze.log"
34
+
35
+ logger = logging.getLogger(__name__)
36
+ logger.setLevel(logging.DEBUG)
37
+
38
+ # Create file handler
39
+ file_handler = logging.FileHandler(log_path, mode="w", encoding="utf-8")
40
+ formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
41
+ file_handler.setFormatter(formatter)
42
+
43
+ # Attach handlers
44
+ logger.addHandler(file_handler)
45
+
46
+ main(logger)
dpacman/data/tfclust/analyze.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import logging
3
+ import os
4
+ import dask.dataframe as dd
5
+ import matplotlib.pyplot as plt
6
+
7
def plot_sequence_lengths_box(lengths, xlog=False, title="Sequence Lengths", out_dir="figures", fname="sequence_lengths_box.png"):
    """Save a horizontal box plot of sequence lengths.

    Can be used with original sequence or flank sequence lengths.

    Args:
        lengths: iterable of sequence lengths.
        xlog: if True, use a logarithmic x-axis.
        title: plot title.
        out_dir: output directory (created if missing).
        fname: output file name inside ``out_dir``.
    """
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, fname)

    plt.figure(figsize=(10, 4))
    plt.boxplot(lengths, vert=False)
    if xlog:
        plt.xscale('log')
    plt.xlabel("Sequence Length")
    plt.title(title)
    plt.grid(True, axis='y', linestyle='--', alpha=0.6)
    plt.tight_layout()

    plt.savefig(out_path, dpi=300)
    # fix: release the figure. This function is called repeatedly per run;
    # without close() every figure stays open and leaks memory.
    plt.close()
24
+
25
def plot_sequence_lengths_hist(lengths, xlog=False, title="Sequence Lengths", out_dir="figures", fname="sequence_lengths_hist.png"):
    """Save a density histogram of sequence lengths with percent y-labels.

    Can be used with original sequence or flank sequence lengths.

    Args:
        lengths: iterable of sequence lengths.
        xlog: if True, use a logarithmic x-axis.
        title: plot title.
        out_dir: output directory (created if missing).
        fname: output file name inside ``out_dir``.
    """
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, fname)

    plt.figure(figsize=(10, 4))
    plt.hist(lengths, bins=100, density=True, alpha=0.75)
    if xlog:
        plt.xscale('log')
    # fix: '{:.2%}' already multiplies by 100; the previous 100*y inflated
    # every tick label by a factor of 100.
    # NOTE(review): with density=True the y values are densities, not
    # fractions of samples — percent labels are exact only for unit-width
    # bins. Confirm this is the intended reading of the axis.
    plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: '{:.2%}'.format(y)))
    plt.xlabel("Sequence Length")
    plt.ylabel("Frequency")
    plt.title(title)
    plt.grid(True, axis='y', linestyle='--', alpha=0.6)
    plt.tight_layout()

    plt.savefig(out_path, dpi=300)
    # fix: release the figure to avoid accumulating open figures across calls.
    plt.close()
45
+
46
def main(logger):
    """Load the combined TF-cluster table and render length plots.

    Reads the merged hg38+hg19 parquet produced by ``combine.py`` and
    writes histogram and box plots of the raw and flanked sequence
    lengths, each in linear-x and log-x variants.
    """
    df_dir = "../../data_files/processed/tfclust/combined"
    df_savepath = os.path.join(df_dir, "encRegTfbsClustered_hg38_hg19.parquet")
    logger.info("Starting to load data file from parquet")
    df = pd.read_parquet(df_savepath, engine="auto")
    logger.info(df.head())

    # (column, plot title, filename stem) for the two length distributions.
    series_specs = (
        ("seq_len", "TF Binding Sites", "seq_lengths"),
        ("seq_flanked_len", "TF Binding Sites with 1000nt Flanks", "seq_lengths_flanked"),
    )
    # Emit hists before boxes within each axis scale, matching the
    # original one-call-per-line ordering.
    for use_xlog in (False, True):
        tag = "_xlog" if use_xlog else ""
        for plotter, kind in (
            (plot_sequence_lengths_hist, "hist"),
            (plot_sequence_lengths_box, "box"),
        ):
            for column, plot_title, stem in series_specs:
                plotter(df[column], xlog=use_xlog, title=plot_title, fname=f"{stem}{tag}_{kind}.png")
62
+
63
if __name__ == "__main__":
    # Script entry point: log everything to analyze.log, overwriting
    # any output from a previous run.
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)

    handler = logging.FileHandler("analyze.log", mode="w", encoding="utf-8")
    handler.setFormatter(
        logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    )
    logger.addHandler(handler)

    main(logger)
dpacman/data/tfclust/combine.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import logging
3
+ import os
4
+ import dask.dataframe as dd
5
+
6
def main():
    """Combine per-chromosome tfclust CSVs into one parquet dataset.

    Reads every processed hg38/hg19 CSV lazily with Dask, tags each row
    with its genome, concatenates the two genomes, adds sequence-length
    columns, and writes the result as parquet.
    """
    hg38_dir = "../../data_files/processed/tfclust/hg38"
    hg19_dir = "../../data_files/processed/tfclust/hg19"

    # Collect per-chromosome CSVs; subdirectories such as logs/ are skipped.
    hg38_files = [os.path.join(hg38_dir,x) for x in os.listdir(hg38_dir) if os.path.isfile(os.path.join(hg38_dir,x))]
    hg19_files = [os.path.join(hg19_dir,x) for x in os.listdir(hg19_dir) if os.path.isfile(os.path.join(hg19_dir,x))]

    logging.info(f"Total hg38 files: {len(hg38_files)}")
    logging.info(f"Total hg19 files: {len(hg19_files)}")

    # Row counts recorded by the download step for each genome.
    hg38_complete = pd.read_csv(os.path.join(hg38_dir,"logs/completed.txt"), sep="\t")
    hg19_complete = pd.read_csv(os.path.join(hg19_dir,"logs/completed.txt"), sep="\t")

    hg38_rows = sum(hg38_complete['row_count'])
    hg19_rows = sum(hg19_complete['row_count'])
    logging.info(f"Total hg38 rows: {hg38_rows:,}")
    logging.info(f"Total hg19 rows: {hg19_rows:,}")
    logging.info(f"Total: {hg38_rows + hg19_rows:,}")

    # Read each genome lazily as a Dask DataFrame and tag its origin
    # (assign is the Dask-safe way to add a constant column).
    full_df_hg38 = dd.read_csv(hg38_files).assign(genome="hg38")
    full_df_hg19 = dd.read_csv(hg19_files).assign(genome="hg19")

    # Concatenate both genomes into one lazy DataFrame.
    full_df = dd.concat([full_df_hg38, full_df_hg19])

    # fix: log the precomputed row total instead of len(full_df) — len() on
    # a Dask DataFrame forces a full pass over every CSV, and the data would
    # then be read a second time by to_parquet below.
    # (also fixes the "ccombined" typo in the message)
    logging.info(f"Added all files to combined DataFrame. Total rows: {hg38_rows + hg19_rows}")

    full_df["seq_len"] = full_df["seq"].str.len()
    full_df["seq_flanked_len"] = full_df["seq_flanked"].str.len()
    logging.info("Added sequence length column.")  # fix: was a placeholder-free f-string

    full_df_dir = "../../data_files/processed/tfclust/combined"
    full_df_savepath = os.path.join(full_df_dir, "encRegTfbsClustered_hg38_hg19.parquet")
    os.makedirs(full_df_dir, exist_ok=True)
    full_df.to_parquet(full_df_savepath)  # parquet: much faster and more compact than CSV
    logging.info(f"Saved combined DataFrame to {full_df_savepath}.")
49
+
50
+
51
if __name__ == "__main__":
    # Script entry point: capture DEBUG-and-above to combine.log,
    # truncating the file on each run.
    logging.basicConfig(
        filename="combine.log",
        filemode="w",
        encoding="utf-8",
        level=logging.DEBUG,
    )
    main()
dpacman/data/tfclust/download.py CHANGED
@@ -170,7 +170,7 @@ def get_sequence(
170
  results_dict = {
171
  "chromStart": new_start,
172
  "chromEnd": new_end,
173
- "seq": chrom_seq[new_start:new_end+1]
174
  }
175
  return results_dict
176
 
 
170
  results_dict = {
171
  "chromStart": new_start,
172
  "chromEnd": new_end,
173
+ "seq": chrom_seq[new_start:new_end]# end is NOT inclusive!!
174
  }
175
  return results_dict
176
 
environment.yaml CHANGED
@@ -22,7 +22,7 @@ channels:
22
 
23
  dependencies:
24
  - python=3.10
25
-
26
  - pip>=23
27
  - pip:
28
  - rootutils
 
22
 
23
  dependencies:
24
  - python=3.10
25
+ - dask[complete]
26
  - pip>=23
27
  - pip:
28
  - rootutils