svincoff committed
Commit ad760ef · 1 Parent(s): e7a2583

download structure

README.md CHANGED
@@ -1,3 +1,29 @@
  ---
  license: cc-by-nc-nd-4.0
  ---
+
+ # Directory Structure
+
+ ```
+ .
+ ├── README.md
+ ├── dpacman
+ │   ├── data
+ │   │   ├── README.md
+ │   │   ├── chip_atlas
+ │   │   │   ├── full_data_loading.py
+ │   │   │   └── smaller_data_loading.py
+ │   │   └── tfclust
+ │   │       └── download.py
+ │   └── data_files
+ │       ├── processed
+ │       │   └── tfclust
+ │       └── raw
+ │           ├── chip_atlas
+ │           │   └── experimentList.tab
+ │           └── tfclust
+ │               ├── encRegTfbsClusteredWithCells.hg19.bed
+ │               └── encRegTfbsClusteredWithCells.hg38.bed
+ ├── environment.yaml
+ └── setup.py
+ ```
dpacman/data/README.md ADDED
@@ -0,0 +1,18 @@
+ # Data download directory
+
+ ## UCSC
+
+ ### Raw data download
+ 1. `encRegTfbsClusteredWithCells.hg38.bed.gz`
+
+ ```
+ wget https://hgdownload.soe.ucsc.edu/goldenPath/hg38/encRegTfbsClustered/encRegTfbsClusteredWithCells.hg38.bed.gz
+ gunzip encRegTfbsClusteredWithCells.hg38.bed.gz
+ ```
+
+ 2. `encRegTfbsClusteredWithCells.hg19.bed.gz`
+
+ ```
+ wget https://hgdownload.soe.ucsc.edu/goldenPath/hg19/encRegTfbsClustered/encRegTfbsClusteredWithCells.hg19.bed.gz
+ gunzip encRegTfbsClusteredWithCells.hg19.bed.gz
+ ```
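For readers who prefer to script the fetch, a minimal Python equivalent of the two `wget`/`gunzip` pairs above might look like this (a sketch, assuming `requests` is installed; the URLs are exactly the ones listed in the README):

```python
import gzip
import shutil
from pathlib import Path

import requests

# Same UCSC files as in dpacman/data/README.md, fetched and decompressed in Python.
for build in ("hg38", "hg19"):
    name = f"encRegTfbsClusteredWithCells.{build}.bed.gz"
    url = f"https://hgdownload.soe.ucsc.edu/goldenPath/{build}/encRegTfbsClustered/{name}"
    gz_path = Path(name)
    gz_path.write_bytes(requests.get(url).content)
    # Decompress to the .bed file, mirroring `gunzip`
    with gzip.open(gz_path, "rb") as src, open(gz_path.with_suffix(""), "wb") as dst:
        shutil.copyfileobj(src, dst)
```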
dpacman/data/chip_atlas/full_data_loading.py CHANGED
@@ -2,71 +2,96 @@ import pandas as pd
  from pathlib import Path
  import subprocess

- #Read only cols 0–2, no header
+ # Read only cols 0–2, no header
  df = pd.read_csv(
      "experimentList.tab",
      sep="\t",
      header=None,
-     usecols=[0,1,2],
-     names=["exp_id","genome","assay_group"],
+     usecols=[0, 1, 2],
+     names=["exp_id", "genome", "assay_group"],
      engine="python",
      on_bad_lines="skip",
-     dtype=str
+     dtype=str,
  )

- #Keep only known genome assemblies
+ # Keep only known genome assemblies
  VALID_GENOMES = {
-     "hg19","hg38",
-     "mm9","mm10",
+     "hg19",
+     "hg38",
+     "mm9",
+     "mm10",
      "rn6",
-     "dm3","dm6",
-     "ce10","ce11",
-     "sacCer3"
+     "dm3",
+     "dm6",
+     "ce10",
+     "ce11",
+     "sacCer3",
  }
  df = df[df["genome"].isin(VALID_GENOMES)]
  print("Assemblies in filtered data:", df["genome"].unique())

- #Classify assay type
+
+ # Classify assay type
  def modality(track):
      t = track.lower()
-     if "atac" in t: return "ATAC"
-     if "dnase" in t: return "DNase"
-     if "bisulfite" in t or "methyl" in t: return "BS"
+     if "atac" in t:
+         return "ATAC"
+     if "dnase" in t:
+         return "DNase"
+     if "bisulfite" in t or "methyl" in t:
+         return "BS"
      return "ChIP"
+
+
  df["modality"] = df["assay_group"].apply(modality)

- #URL templates
+
+ # URL templates
  def make_urls(exp, genome, mod):
      urls = []
-     if mod in ("ChIP","ATAC","DNase"):
+     if mod in ("ChIP", "ATAC", "DNase"):
          urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bw/{exp}.bw")
-         for thr in ("05","10","20"):
-             urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bed{thr}/{exp}.{thr}.bed")
-             urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bb{thr}/{exp}.{thr}.bb")
+         for thr in ("05", "10", "20"):
+             urls.append(
+                 f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bed{thr}/{exp}.{thr}.bed"
+             )
+             urls.append(
+                 f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bb{thr}/{exp}.{thr}.bb"
+             )
      else:
-         urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bs/methyl/{exp}.methyl.bw")
-         urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bs/cover/{exp}.cover.bw")
-         for sub in ("hmr","pmd","hypermr"):
-             urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bs/{sub}/Bed/{exp}.{sub}.bed")
-             urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bs/{sub}/BigBed/{exp}.{sub}.bb")
+         urls.append(
+             f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bs/methyl/{exp}.methyl.bw"
+         )
+         urls.append(
+             f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bs/cover/{exp}.cover.bw"
+         )
+         for sub in ("hmr", "pmd", "hypermr"):
+             urls.append(
+                 f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bs/{sub}/Bed/{exp}.{sub}.bed"
+             )
+             urls.append(
+                 f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bs/{sub}/BigBed/{exp}.{sub}.bb"
+             )
      return urls

- #Write URL lists per genome
- urls_dir = Path("urls_by_genome"); urls_dir.mkdir(exist_ok=True)
+
+ # Write URL lists per genome
+ urls_dir = Path("urls_by_genome")
+ urls_dir.mkdir(exist_ok=True)
  for genome, group in df.groupby("genome"):
      all_urls = []
      for _, row in group.iterrows():
          all_urls += make_urls(row.exp_id, genome, row.modality)
      uniq = sorted(set(all_urls))
-     (urls_dir/f"urls_{genome}.txt").write_text("\n".join(uniq))
+     (urls_dir / f"urls_{genome}.txt").write_text("\n".join(uniq))
      print(f"{genome}: {len(uniq)} URLs")

- #Download into raw/{genome}/
+ # Download into raw/{genome}/
  for url_file in urls_dir.glob("urls_*.txt"):
-     genome = url_file.stem.split("_",1)[1]
-     dest = Path("raw")/genome
+     genome = url_file.stem.split("_", 1)[1]
+     dest = Path("raw") / genome
      dest.mkdir(parents=True, exist_ok=True)
      print(f"\nDownloading {genome} → {dest}/…")
-     subprocess.run(["wget","-nc","-i",str(url_file),"-P",str(dest)], check=True)
+     subprocess.run(["wget", "-nc", "-i", str(url_file), "-P", str(dest)], check=True)

- print("Done! Check raw/{genome}/ for your files.")
+ print("Done! Check raw/{genome}/ for your files.")
dpacman/data/chip_atlas/smaller_data_loading.py CHANGED
@@ -12,7 +12,18 @@ from tqdm import tqdm
  TARGET_REGIONS = 200_000

  # Assemblies to include
- ASSEMBLIES = ["hg19","hg38","mm9","mm10","rn6","dm3","dm6","ce10","ce11","sacCer3"]
+ ASSEMBLIES = [
+     "hg19",
+     "hg38",
+     "mm9",
+     "mm10",
+     "rn6",
+     "dm3",
+     "dm6",
+     "ce10",
+     "ce11",
+     "sacCer3",
+ ]

  # How many experiments to sample at most per protein (tune up/down)
  MAX_EXPS_PER_PROTEIN = 50
@@ -25,64 +36,67 @@ WORKDIR = Path("chip_atlas_fetch")
  WORKDIR.mkdir(exist_ok=True)
  LIST_DIR = WORKDIR / "lists"
  LIST_DIR.mkdir(exist_ok=True)
- DL_DIR = WORKDIR / "downloads"
+ DL_DIR = WORKDIR / "downloads"
  DL_DIR.mkdir(exist_ok=True)

  # ─── HELPERS ──────────────────────────────────────────────────────────────────

+
  def download_and_extract(url, extract_to: Path):
      """Fetch a ZIP and unzip it."""
      local = extract_to / Path(url).name
      if not local.exists():
          print(f"→ Downloading {url}")
-         resp = requests.get(url, stream=True); resp.raise_for_status()
+         resp = requests.get(url, stream=True)
+         resp.raise_for_status()
          with open(local, "wb") as f:
-             for chunk in resp.iter_content(1<<20):
+             for chunk in resp.iter_content(1 << 20):
                  f.write(chunk)
      with zipfile.ZipFile(local, "r") as z:
          z.extractall(extract_to)

+
  # ─── 1) GET MASTER LISTS ────────────────────────────────────────────────────

  print("1) Fetching master file & experiment lists…")
- FILELIST_URL = "https://dbarchive.biosciencedbc.jp/data/chip-atlas/LATEST/chip_atlas_file_list.zip"
- EXPERIMENTLIST_URL= "https://dbarchive.biosciencedbc.jp/data/chip-atlas/LATEST/chip_atlas_experiment_list.zip"
+ FILELIST_URL = (
+     "https://dbarchive.biosciencedbc.jp/data/chip-atlas/LATEST/chip_atlas_file_list.zip"
+ )
+ EXPERIMENTLIST_URL = "https://dbarchive.biosciencedbc.jp/data/chip-atlas/LATEST/chip_atlas_experiment_list.zip"

  download_and_extract(FILELIST_URL, LIST_DIR)
  download_and_extract(EXPERIMENTLIST_URL, LIST_DIR)

- filelist_txt = LIST_DIR / "chip_atlas_file_list.csv"
- experiment_txt = LIST_DIR / "chip_atlas_experiment_list.csv"
+ filelist_txt = LIST_DIR / "chip_atlas_file_list.csv"
+ experiment_txt = LIST_DIR / "chip_atlas_experiment_list.csv"

  # ─── 2) PARSE EXPERIMENT METADATA ────────────────────────────────────────────

  print("2) Parsing experiment → protein lookup…")
  exp_df = pd.read_csv(
      experiment_txt,
-     sep=None,  # let python engine guess (comma vs. tab)
-     engine="python",  # required when sep=None
-     encoding="latin1"  # to avoid UnicodeDecodeErrors
+     sep=None,  # let python engine guess (comma vs. tab)
+     engine="python",  # required when sep=None
+     encoding="latin1",  # to avoid UnicodeDecodeErrors
  )

  print("Columns in experiment list:", exp_df.columns.tolist())

- exp_df = (
-     exp_df
-     .loc[:, ['Experimental ID', 'Genome assembly', 'Antigen']]
-     .rename(columns={
-         'Experimental ID': 'exp_id',
-         'Genome assembly': 'genome',
-         'Antigen': 'assay_group'
-     })
+ exp_df = exp_df.loc[:, ["Experimental ID", "Genome assembly", "Antigen"]].rename(
+     columns={
+         "Experimental ID": "exp_id",
+         "Genome assembly": "genome",
+         "Antigen": "assay_group",
+     }
  )

- exp_df['protein'] = exp_df['assay_group'].str.replace(r'_ChIP.*', '', regex=True)
+ exp_df["protein"] = exp_df["assay_group"].str.replace(r"_ChIP.*", "", regex=True)

  # Finally, filter to only the assemblies you care about:
- exp_df = exp_df[exp_df['genome'].isin(ASSEMBLIES)]
+ exp_df = exp_df[exp_df["genome"].isin(ASSEMBLIES)]

  # build lookup
- exp_to_genome = exp_df.set_index("exp_id")["genome"].to_dict()
+ exp_to_genome = exp_df.set_index("exp_id")["genome"].to_dict()
  exp_to_protein = exp_df.set_index("exp_id")["protein"].to_dict()

  # ─── 3) BUILD URL LIST DIRECTLY ───────────────────────────────────────────────
@@ -93,11 +107,12 @@ urls_by_exp = {}
  for exp, genome in exp_to_genome.items():
      urls_by_exp[exp] = [
          f"{BASE}/data/{genome}/eachData/bw/{exp}.bw",
-         f"{BASE}/data/{genome}/eachData/bed10/{exp}.10.bed"
+         f"{BASE}/data/{genome}/eachData/bed10/{exp}.10.bed",
      ]

  # bucket experiments by protein
  from collections import defaultdict
+
  prot_exps = defaultdict(list)
  for exp, prot in exp_to_protein.items():
      if exp in urls_by_exp:
@@ -127,14 +142,19 @@ print(f" → Wrote {len(final_urls):,} URLs to {url_list_file}")
  # ─── 4) PARALLEL DOWNLOAD VIA aria2c ─────────────────────────────────────────

  print("4) Downloading with aria2c…")
- subprocess.run([
-     "aria2c",
-     f"-x{ARIA2C_CONN}",
-     "--dir", str(DL_DIR),
-     "--input-file", str(url_list_file),
-     "--auto-file-renaming=false",
-     "--allow-overwrite=true"
- ], check=True)
+ subprocess.run(
+     [
+         "aria2c",
+         f"-x{ARIA2C_CONN}",
+         "--dir",
+         str(DL_DIR),
+         "--input-file",
+         str(url_list_file),
+         "--auto-file-renaming=false",
+         "--allow-overwrite=true",
+     ],
+     check=True,
+ )

  print("✅ Finished downloading all selected files.")
  print(f"Your files are in: {DL_DIR.resolve()}")
dpacman/data/tfclust/download.py ADDED
@@ -0,0 +1,375 @@
+ import requests
+ from time import sleep
+ import json
+ import logging
+ import multiprocessing
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ import os
+ import pandas as pd
+
+
+ def get_all_tfs(genome: str = "hg38"):
+     """
+     Get all the transcription factors from the appropriate encRegTfbsClusteredWithCells.<genome>.bed file.
+     Available in data_files/raw/tfclust for genomes hg38 and hg19.
+     """
+     # Read raw file
+     raw_data = pd.read_csv(
+         f"../data_files/raw/tfclust/encRegTfbsClusteredWithCells.{genome}.bed",
+         sep="\t",
+         header=None,
+     )
+     raw_data.columns = ["chrom", "start", "end", "tf_name", "score", "cell_line"]
+
+     # Extract all unique TF names
+     all_tfs = raw_data["tf_name"].unique().tolist()
+     logging.info(f"Found {len(all_tfs)} transcription factors in genome {genome}.")
+
+     return all_tfs
+
+
+ def get_all_chroms(genome: str = "hg38"):
+     """
+     Fetch all chromosome names for a genome.
+     Note: some chromosomes are in unexpected formats (e.g. there is 'chr15', but also 'chr15_ML143371v1_fix')
+     """
+     url = f"https://api.genome.ucsc.edu/list/chromosomes?genome={genome}"
+     try:
+         r = requests.get(url)
+         r.raise_for_status()
+     except requests.RequestException as e:
+         raise ValueError(f"Failed to fetch all chromosomes for genome {genome}") from e
+
+     all_chroms = list(r.json()["chromosomes"])
+     logging.info(f"Found {len(all_chroms)} chromosomes in genome {genome}.")
+
+     return all_chroms
+
+
+ def fetch_tfbs_track(chrom: str, genome: str = "hg38"):
+     """
+     Fetch raw data from the track encRegTfbsClustered.
+     Returns json data for the specified chromosome, where key information appears as follows:
+     "encRegTfbsClustered": [
+         {
+             "bin": 585,
+             "chrom": "chr1",
+             "chromStart": 9917,
+             "chromEnd": 10247,
+             "name": "NUFIP1",
+             "score": 680,
+             "sourceCount": 1,
+             "sourceIds": "1063",
+             "sourceScores": "680"
+         }, ...
+     ]
+     """
+     params = {"genome": genome, "track": "encRegTfbsClustered", "chrom": chrom}
+     url = f"https://api.genome.ucsc.edu/getData/track?genome={params['genome']};track={params['track']};chrom={params['chrom']}"
+     try:
+         r = requests.get(url)
+         r.raise_for_status()
+     except requests.RequestException as e:
+         raise ValueError(
+             f"Failed to fetch encRegTfbsClustered for {chrom} in genome {genome}"
+         ) from e
+
+     # Extract the output and save it
+     json_out_dir = f"../data_files/raw/tfclust/encRegTfbsClustered_data/{genome}"
+     os.makedirs(json_out_dir, exist_ok=True)
+
+     # Save it
+     json_output = r.json()
+     with open(
+         f"{json_out_dir}/{params['genome']}_{params['track']}_{params['chrom']}.json",
+         "w",
+     ) as f:
+         json.dump(json_output, f, indent=4)
+
+     logging.info(
+         f"Saved to {json_out_dir}/{params['genome']}_{params['track']}_{params['chrom']}.json"
+     )
+     return json_output
+
+
+ def get_sequence(
+     chrom: str,
+     start: int,
+     end: int,
+     flank5: int = 0,
+     flank3: int = 0,
+     genome: str = "hg38",
+ ):
+     """
+     Given genome, start position, end position, chromosome, and desired flank sizes, extract the raw DNA sequence.
+     """
+     # Widen the interval by the requested 5' and 3' flanks
+     new_start = max(0, start - flank5)
+     new_end = end + flank3
+     region = f"{chrom}:{new_start}-{new_end}"
+     url = f"https://api.genome.ucsc.edu/getData/sequence?genome={genome};chrom={chrom};start={new_start};end={new_end}"
+     try:
+         r = requests.get(url)
+         r.raise_for_status()
+     except requests.RequestException as e:
+         raise ValueError(
+             f"Failed to fetch sequence for {region} in genome {genome}"
+         ) from e
+
+     results_dict = {
+         "chromStart": new_start,
+         "chromEnd": new_end,
+         "seq": r.json()["dna"],
+     }
+     return results_dict
+
+
+ def extract_tfbs_with_context(
+     genome: str = "hg38",
+     flank5: int = 500,
+     flank3: int = 500,
+     control_run: bool = True,  # if there's a flank, whether to also run without flank
+     out_dir: str = "../data_files/processed/tfclust",
+     allowed_tfs: list = None,  # e.g., ['CTCF', 'MAX']
+     chroms: list = None,
+ ):
+     """
+     Loop through raw downloads and extract TF binding sites (bs) with flanks.
+     Builds a DataFrame with all the available data for each TF. Columns =
+     ["bin", "chrom", "chromStart", "chromEnd", "name", "score", "sourceCount",
+     "sourceIds", "sourceScores", "seq", "seq_flanked", "chromStart_flanked",
+     "chromEnd_flanked", "flank5", "flank3"]
+     """
+     # Prepare to save output
+     os.makedirs(out_dir, exist_ok=True)
+
+     # Get chromosomes
+     if chroms is None:
+         logging.info(
+             "No chromosomes provided, fetching all chromosomes for the given genome..."
+         )
+         chroms = get_all_chroms(genome)
+     count = 0
+
+     # Initialize the final DF
+     results_cols = [
+         "bin",
+         "chrom",
+         "chromStart",
+         "chromEnd",
+         "name",
+         "score",
+         "sourceCount",
+         "sourceIds",
+         "sourceScores",
+         "seq",
+         "seq_flanked",
+         "chromStart_flanked",
+         "chromEnd_flanked",
+         "flank5",
+         "flank3",
+     ]
+     results_init = pd.DataFrame(columns=results_cols)
+
+     # Make a list of the types of runs we need: a flanked run, plus an
+     # unflanked control run if requested
+     queries = [{"flank5": flank5, "flank3": flank3}]
+     if flank5 == 0 and flank3 == 0:
+         queries[0]["type"] = "control"
+     else:
+         queries[0]["type"] = "flank"
+         if control_run:
+             queries.append({"type": "control", "flank5": 0, "flank3": 0})
+
+     # For each chromosome, download the encRegTfbsClustered track, extract the
+     # features, and fetch the sequences
+     for chrom in chroms:
+         # Write the header row for this chromosome's output file
+         out_file = f"{out_dir}/encRegTfbsClustered_{genome}_{chrom}.csv"
+         results_init.to_csv(out_file, index=False)
+         logging.info(f"Fetching {chrom}...")
+         # Fetch the data json (has start and end positions in the chrom, but not the sequence)
+         try:
+             data = fetch_tfbs_track(chrom, genome=genome)
+             logging.info(f"  → Fetched {chrom} successfully")
+             features = data.get("encRegTfbsClustered", [])
+             logging.info(f"  → Found {len(features)} features")
+         except Exception as e:
+             logging.info(f"  Failed to fetch {chrom}: {e}")
+             continue
+
+         # Get the sequences of the DNA binding sites
+         for feature_no, feature in enumerate(features):
+             # Initialize new results row
+             new_row = {}
+
+             # Skip TFs outside the allowed list, if one was given
+             tf_name = feature.get("name", "UnknownTF")
+             if allowed_tfs and tf_name not in allowed_tfs:
+                 logging.warning(f"TF name {tf_name} not in allowed_tfs. Skipping.")
+                 continue
+             # Make sure the chromosomes match and we have the right sequence!
+             assert (
+                 feature["chrom"] == chrom
+             ), f"Chromosome mismatch: {feature['chrom']} != {chrom}"
+
+             # Add all the cols already in the json
+             for c in results_cols:
+                 if c in feature:
+                     new_row[c] = feature[c]
+
+             ### Extract sequence
+             start = feature["chromStart"]
+             end = feature["chromEnd"]
+
+             for query in queries:
+                 try:
+                     results_dict = get_sequence(
+                         chrom,
+                         start,
+                         end,
+                         flank5=query["flank5"],
+                         flank3=query["flank3"],
+                         genome=genome,
+                     )
+                     logging.info(
+                         f"  Success on feat. {feature_no} {chrom}:{start}-{end}, type {query['type']}"
+                     )
+                     # Add the returned info
+                     if query["type"] == "control":
+                         new_row["seq"] = results_dict["seq"]
+                     else:
+                         new_row["seq_flanked"] = results_dict["seq"]
+                         new_row["chromStart_flanked"] = results_dict["chromStart"]
+                         new_row["chromEnd_flanked"] = results_dict["chromEnd"]
+                         new_row["flank5"] = flank5
+                         new_row["flank3"] = flank3
+                     count += 1
+                 except Exception as e:
+                     logging.info(
+                         f"  Skipped feat. {feature_no} {chrom}:{start}-{end} due to error: {e}"
+                     )
+                     continue
+
+                 sleep(0.05)  # Stay within UCSC's 20 req/sec rate limit
+
+             # Fill out any blank columns
+             for c in results_cols:
+                 if c not in new_row:
+                     new_row[c] = None
+
+             new_row_df = pd.DataFrame([new_row], columns=results_cols)
+             if new_row["seq"] is not None or new_row["seq_flanked"] is not None:
+                 new_row_df.to_csv(
+                     out_file,
+                     mode="a",
+                     index=False,
+                     header=False,
+                 )
+                 logging.info(f"Wrote new row to {out_file}")
+
+     logging.info(f"Done. Wrote {count} sequences to {out_dir}")
+
+
+ # Thread function for one chromosome
+ def process_chrom(
+     chrom: str = "chr1",
+     genome: str = "hg38",
+     flank5: int = 500,
+     flank3: int = 500,
+     control_run: bool = True,
+     out_dir: str = "../data_files/processed/tfclust",
+     allowed_tfs: list = None,
+ ):
+     """
+     Called within the parallel method to start a thread.
+     """
+     logging.info(f"Starting thread for {chrom}")
+     try:
+         extract_tfbs_with_context(
+             genome=genome,
+             flank5=flank5,
+             flank3=flank3,
+             control_run=control_run,
+             out_dir=out_dir,
+             allowed_tfs=allowed_tfs,
+             chroms=[chrom],  # important: wrap in list
+         )
+         logging.info(f"Finished {chrom}")
+     except Exception as e:
+         logging.error(f"Error processing {chrom}: {e}")
+
+
+ def parallel_extract_tfbs_with_context(
+     genome: str = "hg38",
+     flank5: int = 500,
+     flank3: int = 500,
+     control_run: bool = True,
+     out_dir: str = "../data_files/processed/tfclust",
+     allowed_tfs: list = None,
+     chroms: list = None,
+     max_cpu_frac: float = None,
+ ):
+     """
+     Call extract_tfbs_with_context() using multithreading, one thread per chromosome.
+     """
+     # Get all chromosomes if not supplied
+     if chroms is None:
+         chroms = get_all_chroms(genome=genome)
+
+     # Determine max workers
+     max_workers = len(chroms)
+     max_available = int(multiprocessing.cpu_count())
+     if max_cpu_frac is not None:
+         max_available = int(multiprocessing.cpu_count() * max_cpu_frac)
+     max_workers = min(max_workers, max_available)
+     logging.info(
+         f"{max_available} CPU cores available. Using {max_workers} threads for genome {genome}..."
+     )
+
+     # Launch threads, forwarding this call's settings to each worker
+     with ThreadPoolExecutor(max_workers=max_workers) as executor:
+         futures = {
+             executor.submit(
+                 process_chrom,
+                 chrom,
+                 genome=genome,
+                 flank5=flank5,
+                 flank3=flank3,
+                 control_run=control_run,
+                 out_dir=out_dir,
+                 allowed_tfs=allowed_tfs,
+             ): chrom
+             for chrom in chroms
+         }
+         for future in as_completed(futures):
+             chrom = futures[future]
+             try:
+                 future.result()
+             except Exception as e:
+                 logging.error(f"Chromosome {chrom} raised an exception: {e}")
+
+
+ def main():
+     genomes = ["hg38", "hg19"]
+     frac_per_genome = round(1 / len(genomes), 1)
+     for genome in genomes:
+         parallel_extract_tfbs_with_context(
+             genome=genome,
+             flank5=500,
+             flank3=500,
+             control_run=True,  # if there's a flank, whether to also run without flank
+             out_dir=f"../data_files/processed/tfclust/{genome}",
+             allowed_tfs=None,  # e.g., ['CTCF', 'MAX']
+             chroms=None,
+             max_cpu_frac=frac_per_genome,
+         )
+
+
+ if __name__ == "__main__":
+     logging.basicConfig(
+         filename="download.log",
+         encoding="utf-8",
+         level=logging.DEBUG,
+         filemode="w",
+     )
+     main()