ananyakrishna committed
Commit ecf5d7f · verified · 1 Parent(s): 2676cd2

Upload 3 files

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ experimentList.tab filter=lfs diff=lfs merge=lfs -text
experimentList.tab ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d2a0ddc152403c55eacad60e5ece0b7ca35d90bd6d18129687323ac812be6233
+ size 344940779
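
Both loader scripts below parse experimentList.tab directly, but as the pointer above shows, the file itself lives in Git LFS: a checkout without LFS leaves this 3-line stub in place of the ~345 MB table. A minimal pre-flight guard one could run before parsing (a sketch; is_lfs_pointer is a hypothetical helper, not part of this commit):

    from pathlib import Path

    def is_lfs_pointer(path: str) -> bool:
        # Genuine LFS pointer files are tiny and begin with the spec
        # version line shown in the diff above.
        p = Path(path)
        return p.stat().st_size < 512 and p.read_text().startswith(
            "version https://git-lfs.github.com/spec/v1"
        )

    if is_lfs_pointer("experimentList.tab"):
        raise SystemExit("experimentList.tab is an LFS pointer; run `git lfs pull` first.")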
full_data_loading.py ADDED
@@ -0,0 +1,72 @@
+ import pandas as pd
+ from pathlib import Path
+ import subprocess
+
+ # Read only cols 0–2, no header
+ df = pd.read_csv(
+     "experimentList.tab",
+     sep="\t",
+     header=None,
+     usecols=[0, 1, 2],
+     names=["exp_id", "genome", "assay_group"],
+     engine="python",
+     on_bad_lines="skip",
+     dtype=str
+ )
+
+ # Keep only known genome assemblies
+ VALID_GENOMES = {
+     "hg19", "hg38",
+     "mm9", "mm10",
+     "rn6",
+     "dm3", "dm6",
+     "ce10", "ce11",
+     "sacCer3"
+ }
+ df = df[df["genome"].isin(VALID_GENOMES)]
+ print("Assemblies in filtered data:", df["genome"].unique())
+
+ # Classify assay type
+ def modality(track):
+     t = str(track).lower()  # str() guards against NaN in the antigen field
+     if "atac" in t: return "ATAC"
+     if "dnase" in t: return "DNase"
+     if "bisulfite" in t or "methyl" in t: return "BS"
+     return "ChIP"
+ df["modality"] = df["assay_group"].apply(modality)
+
+ # URL templates
+ def make_urls(exp, genome, mod):
+     urls = []
+     if mod in ("ChIP", "ATAC", "DNase"):
+         urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bw/{exp}.bw")
+         for thr in ("05", "10", "20"):
+             urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bed{thr}/{exp}.{thr}.bed")
+             urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bb{thr}/{exp}.{thr}.bb")
+     else:
+         urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bs/methyl/{exp}.methyl.bw")
+         urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bs/cover/{exp}.cover.bw")
+         for sub in ("hmr", "pmd", "hypermr"):
+             urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bs/{sub}/Bed/{exp}.{sub}.bed")
+             urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bs/{sub}/BigBed/{exp}.{sub}.bb")
+     return urls
+
+ # Write URL lists per genome
+ urls_dir = Path("urls_by_genome"); urls_dir.mkdir(exist_ok=True)
+ for genome, group in df.groupby("genome"):
+     all_urls = []
+     for _, row in group.iterrows():
+         all_urls += make_urls(row.exp_id, genome, row.modality)
+     uniq = sorted(set(all_urls))
+     (urls_dir / f"urls_{genome}.txt").write_text("\n".join(uniq))
+     print(f"{genome}: {len(uniq)} URLs")
+
+ # Download into raw/{genome}/
+ for url_file in urls_dir.glob("urls_*.txt"):
+     genome = url_file.stem.split("_", 1)[1]
+     dest = Path("raw") / genome
+     dest.mkdir(parents=True, exist_ok=True)
+     print(f"\nDownloading {genome} → {dest}/…")
+     subprocess.run(["wget", "-nc", "-i", str(url_file), "-P", str(dest)], check=True)
+
+ print("Done! Check raw/{genome}/ for your files.")
smaller_data_loading.py ADDED
@@ -0,0 +1,140 @@
+ #!/usr/bin/env python3
+ import os, sys, zipfile
+ import subprocess
+ import random
+ from pathlib import Path
+ import requests
+ import pandas as pd
+ from tqdm import tqdm
+
+ # ─── PARAMETERS ───────────────────────────────────────────────────────────────
+ # total target regions (rough guide; you'll filter post-download if needed)
+ TARGET_REGIONS = 200_000  # not enforced below; kept as a sizing guide
+
+ # Assemblies to include
+ ASSEMBLIES = ["hg19", "hg38", "mm9", "mm10", "rn6", "dm3", "dm6", "ce10", "ce11", "sacCer3"]
+
+ # How many experiments to sample at most per protein (tune up/down)
+ MAX_EXPS_PER_PROTEIN = 50
+
+ # Number of parallel connections for aria2c
+ ARIA2C_CONN = 16
+
+ # Working directories
+ WORKDIR = Path("chip_atlas_fetch")
+ WORKDIR.mkdir(exist_ok=True)
+ LIST_DIR = WORKDIR / "lists"
+ LIST_DIR.mkdir(exist_ok=True)
+ DL_DIR = WORKDIR / "downloads"
+ DL_DIR.mkdir(exist_ok=True)
+
+ # ─── HELPERS ──────────────────────────────────────────────────────────────────
+
+ def download_and_extract(url, extract_to: Path):
+     """Fetch a ZIP and unzip it."""
+     local = extract_to / Path(url).name
+     if not local.exists():
+         print(f"→ Downloading {url}")
+         resp = requests.get(url, stream=True); resp.raise_for_status()
+         with open(local, "wb") as f:
+             for chunk in resp.iter_content(1 << 20):
+                 f.write(chunk)
+     with zipfile.ZipFile(local, "r") as z:
+         z.extractall(extract_to)
+
+ # ─── 1) GET MASTER LISTS ────────────────────────────────────────────────────
+
+ print("1) Fetching master file & experiment lists…")
+ FILELIST_URL = "https://dbarchive.biosciencedbc.jp/data/chip-atlas/LATEST/chip_atlas_file_list.zip"
+ EXPERIMENTLIST_URL = "https://dbarchive.biosciencedbc.jp/data/chip-atlas/LATEST/chip_atlas_experiment_list.zip"
+
+ download_and_extract(FILELIST_URL, LIST_DIR)
+ download_and_extract(EXPERIMENTLIST_URL, LIST_DIR)
+
+ filelist_txt = LIST_DIR / "chip_atlas_file_list.csv"  # defined for reference; only the experiment list is parsed below
+ experiment_txt = LIST_DIR / "chip_atlas_experiment_list.csv"
+
+ # ─── 2) PARSE EXPERIMENT METADATA ────────────────────────────────────────────
+
+ print("2) Parsing experiment → protein lookup…")
+ exp_df = pd.read_csv(
+     experiment_txt,
+     sep=None,           # let the python engine guess (comma vs. tab)
+     engine="python",    # required when sep=None
+     encoding="latin1"   # to avoid UnicodeDecodeErrors
+ )
+
+ print("Columns in experiment list:", exp_df.columns.tolist())
+
+ exp_df = (
+     exp_df
+     .loc[:, ['Experimental ID', 'Genome assembly', 'Antigen']]
+     .rename(columns={
+         'Experimental ID': 'exp_id',
+         'Genome assembly': 'genome',
+         'Antigen': 'assay_group'
+     })
+ )
+
+ exp_df['protein'] = exp_df['assay_group'].str.replace(r'_ChIP.*', '', regex=True)
+
+ # Finally, filter to only the assemblies you care about:
+ exp_df = exp_df[exp_df['genome'].isin(ASSEMBLIES)]
+
+ # build lookup
+ exp_to_genome = exp_df.set_index("exp_id")["genome"].to_dict()
+ exp_to_protein = exp_df.set_index("exp_id")["protein"].to_dict()
+
+ # ─── 3) BUILD URL LIST DIRECTLY ───────────────────────────────────────────────
+
+ print("3) Building URL list for .bw + .10.bed…")
+ BASE = "https://dbarchive.biosciencedbc.jp/data/chip-atlas"
+ urls_by_exp = {}
+ for exp, genome in exp_to_genome.items():
+     urls_by_exp[exp] = [
+         f"{BASE}/data/{genome}/eachData/bw/{exp}.bw",
+         f"{BASE}/data/{genome}/eachData/bed10/{exp}.10.bed"
+     ]
+
+ # bucket experiments by protein
+ from collections import defaultdict
+ prot_exps = defaultdict(list)
+ for exp, prot in exp_to_protein.items():
+     if exp in urls_by_exp:
+         prot_exps[prot].append(exp)
+
+ # sample up to MAX_EXPS_PER_PROTEIN per protein (unseeded; call random.seed() first for reproducibility)
+ sampled_exps = []
+ for prot, exps in prot_exps.items():
+     k = min(len(exps), MAX_EXPS_PER_PROTEIN)
+     sampled_exps += random.sample(exps, k)
+
+ print(f" → Sampling {len(sampled_exps):,} experiments across {len(prot_exps)} proteins")
+
+ # collect URLs for just those experiments
+ final_urls = []
+ for exp in sampled_exps:
+     final_urls += urls_by_exp[exp]
+ random.shuffle(final_urls)
+
+ # write out for aria2c
+ url_list_file = WORKDIR / "to_download.txt"
+ with open(url_list_file, "w") as f:
+     for u in final_urls:
+         f.write(u + "\n")
+ print(f" → Wrote {len(final_urls):,} URLs to {url_list_file}")
+
+ # ─── 4) PARALLEL DOWNLOAD VIA aria2c ─────────────────────────────────────────
+
+ print("4) Downloading with aria2c…")
+ subprocess.run([
+     "aria2c",
+     f"-x{ARIA2C_CONN}",
+     "--dir", str(DL_DIR),
+     "--input-file", str(url_list_file),
+     "--auto-file-renaming=false",
+     "--allow-overwrite=true"
+ ], check=True)
+
+ print("✅ Finished downloading all selected files.")
+ print(f"Your files are in: {DL_DIR.resolve()}")