Upload 3 files

- .gitattributes +1 -0
- experimentList.tab +3 -0
- full_data_loading.py +72 -0
- smaller_data_loading.py +140 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+experimentList.tab filter=lfs diff=lfs merge=lfs -text

experimentList.tab
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d2a0ddc152403c55eacad60e5ece0b7ca35d90bd6d18129687323ac812be6233
+size 344940779
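
Note: experimentList.tab is tracked with Git LFS, so a clone made without LFS installed leaves only the three-line pointer above on disk rather than the ~345 MB table that both scripts below read. A minimal sanity check before running them, as a sketch (the `version https://git-lfs.github.com/spec/v1` prefix is the standard LFS pointer header; the helper name is ours):

def is_lfs_pointer(path: str) -> bool:
    """True if `path` still holds a Git LFS pointer rather than the real data."""
    with open(path, "rb") as f:
        head = f.read(100)
    return head.startswith(b"version https://git-lfs.github.com/spec/v1")

if is_lfs_pointer("experimentList.tab"):
    raise SystemExit("experimentList.tab is still an LFS pointer; run `git lfs pull` first.")
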
full_data_loading.py
ADDED
@@ -0,0 +1,72 @@

import pandas as pd
from pathlib import Path
import subprocess

# Read only cols 0–2, no header
df = pd.read_csv(
    "experimentList.tab",
    sep="\t",
    header=None,
    usecols=[0, 1, 2],
    names=["exp_id", "genome", "assay_group"],
    engine="python",
    on_bad_lines="skip",
    dtype=str
)

# Keep only known genome assemblies
VALID_GENOMES = {
    "hg19", "hg38",
    "mm9", "mm10",
    "rn6",
    "dm3", "dm6",
    "ce10", "ce11",
    "sacCer3"
}
df = df[df["genome"].isin(VALID_GENOMES)]
print("Assemblies in filtered data:", df["genome"].unique())

# Classify assay type
def modality(track):
    t = track.lower()
    if "atac" in t: return "ATAC"
    if "dnase" in t: return "DNase"
    if "bisulfite" in t or "methyl" in t: return "BS"
    return "ChIP"
df["modality"] = df["assay_group"].apply(modality)

# URL templates
def make_urls(exp, genome, mod):
    urls = []
    if mod in ("ChIP", "ATAC", "DNase"):
        urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bw/{exp}.bw")
        for thr in ("05", "10", "20"):
            urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bed{thr}/{exp}.{thr}.bed")
            urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bb{thr}/{exp}.{thr}.bb")
    else:
        urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bs/methyl/{exp}.methyl.bw")
        urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bs/cover/{exp}.cover.bw")
        for sub in ("hmr", "pmd", "hypermr"):
            urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bs/{sub}/Bed/{exp}.{sub}.bed")
            urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bs/{sub}/BigBed/{exp}.{sub}.bb")
    return urls

# Write URL lists per genome
urls_dir = Path("urls_by_genome"); urls_dir.mkdir(exist_ok=True)
for genome, group in df.groupby("genome"):
    all_urls = []
    for _, row in group.iterrows():
        all_urls += make_urls(row.exp_id, genome, row.modality)
    uniq = sorted(set(all_urls))
    (urls_dir / f"urls_{genome}.txt").write_text("\n".join(uniq))
    print(f"{genome}: {len(uniq)} URLs")

# Download into raw/{genome}/
for url_file in urls_dir.glob("urls_*.txt"):
    genome = url_file.stem.split("_", 1)[1]
    dest = Path("raw") / genome
    dest.mkdir(parents=True, exist_ok=True)
    print(f"\nDownloading {genome} → {dest}/…")
    subprocess.run(["wget", "-nc", "-i", str(url_file), "-P", str(dest)], check=True)

print("Done! Check raw/{genome}/ for your files.")
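
One caveat with the download step above: not every experiment has every file type on the server, and wget exits non-zero when any URL in the list 404s, so the `check=True` call can raise even though most files downloaded fine. A sketch of a pre-flight pass that drops dead URLs first (the helper name is ours; `requests` is an extra dependency here, and since some servers answer HEAD differently from GET, treat this as a heuristic):

import requests
from pathlib import Path

def prune_dead_urls(url_file: Path, timeout: float = 10.0) -> None:
    """Rewrite `url_file` keeping only URLs that answer a HEAD request with 2xx."""
    alive = []
    for u in url_file.read_text().splitlines():
        try:
            if requests.head(u, allow_redirects=True, timeout=timeout).ok:
                alive.append(u)
        except requests.RequestException:
            pass  # unreachable → drop
    url_file.write_text("\n".join(alive))

for f in Path("urls_by_genome").glob("urls_*.txt"):
    prune_dead_urls(f)
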
smaller_data_loading.py
ADDED
@@ -0,0 +1,140 @@

#!/usr/bin/env python3
import os, sys, zipfile
import subprocess
import random
from pathlib import Path
import requests
import pandas as pd
from tqdm import tqdm

# ─── PARAMETERS ───────────────────────────────────────────────────────────────
# total target regions (rough guide; you'll filter post-download if needed)
TARGET_REGIONS = 200_000

# Assemblies to include
ASSEMBLIES = ["hg19", "hg38", "mm9", "mm10", "rn6", "dm3", "dm6", "ce10", "ce11", "sacCer3"]

# How many experiments to sample at most per protein (tune up/down)
MAX_EXPS_PER_PROTEIN = 50

# Number of parallel connections for aria2c
ARIA2C_CONN = 16

# Working directories
WORKDIR = Path("chip_atlas_fetch")
WORKDIR.mkdir(exist_ok=True)
LIST_DIR = WORKDIR / "lists"
LIST_DIR.mkdir(exist_ok=True)
DL_DIR = WORKDIR / "downloads"
DL_DIR.mkdir(exist_ok=True)

# ─── HELPERS ──────────────────────────────────────────────────────────────────

def download_and_extract(url, extract_to: Path):
    """Fetch a ZIP and unzip it."""
    local = extract_to / Path(url).name
    if not local.exists():
        print(f"→ Downloading {url}")
        resp = requests.get(url, stream=True); resp.raise_for_status()
        with open(local, "wb") as f:
            for chunk in resp.iter_content(1 << 20):
                f.write(chunk)
    with zipfile.ZipFile(local, "r") as z:
        z.extractall(extract_to)

# ─── 1) GET MASTER LISTS ──────────────────────────────────────────────────────

print("1) Fetching master file & experiment lists…")
FILELIST_URL = "https://dbarchive.biosciencedbc.jp/data/chip-atlas/LATEST/chip_atlas_file_list.zip"
EXPERIMENTLIST_URL = "https://dbarchive.biosciencedbc.jp/data/chip-atlas/LATEST/chip_atlas_experiment_list.zip"

download_and_extract(FILELIST_URL, LIST_DIR)
download_and_extract(EXPERIMENTLIST_URL, LIST_DIR)

filelist_txt = LIST_DIR / "chip_atlas_file_list.csv"
experiment_txt = LIST_DIR / "chip_atlas_experiment_list.csv"

# ─── 2) PARSE EXPERIMENT METADATA ─────────────────────────────────────────────

print("2) Parsing experiment → protein lookup…")
exp_df = pd.read_csv(
    experiment_txt,
    sep=None,          # let the python engine sniff the delimiter (comma vs. tab)
    engine="python",   # required when sep=None
    encoding="latin1"  # to avoid UnicodeDecodeErrors
)

print("Columns in experiment list:", exp_df.columns.tolist())

exp_df = (
    exp_df
    .loc[:, ['Experimental ID', 'Genome assembly', 'Antigen']]
    .rename(columns={
        'Experimental ID': 'exp_id',
        'Genome assembly': 'genome',
        'Antigen': 'assay_group'
    })
)

exp_df['protein'] = exp_df['assay_group'].str.replace(r'_ChIP.*', '', regex=True)

# Finally, filter to only the assemblies you care about:
exp_df = exp_df[exp_df['genome'].isin(ASSEMBLIES)]

# build lookups
exp_to_genome = exp_df.set_index("exp_id")["genome"].to_dict()
exp_to_protein = exp_df.set_index("exp_id")["protein"].to_dict()

# ─── 3) BUILD URL LIST DIRECTLY ───────────────────────────────────────────────

print("3) Building URL list for .bw + .10.bed…")
BASE = "https://dbarchive.biosciencedbc.jp/data/chip-atlas"
urls_by_exp = {}
for exp, genome in exp_to_genome.items():
    urls_by_exp[exp] = [
        f"{BASE}/data/{genome}/eachData/bw/{exp}.bw",
        f"{BASE}/data/{genome}/eachData/bed10/{exp}.10.bed"
    ]

# bucket experiments by protein
from collections import defaultdict
prot_exps = defaultdict(list)
for exp, prot in exp_to_protein.items():
    if exp in urls_by_exp:
        prot_exps[prot].append(exp)

# sample up to MAX_EXPS_PER_PROTEIN per protein
sampled_exps = []
for prot, exps in prot_exps.items():
    k = min(len(exps), MAX_EXPS_PER_PROTEIN)
    sampled_exps += random.sample(exps, k)

print(f" → Sampling {len(sampled_exps):,} experiments across {len(prot_exps)} proteins")

# collect URLs for just those experiments
final_urls = []
for exp in sampled_exps:
    final_urls += urls_by_exp[exp]
random.shuffle(final_urls)

# write out for aria2c
url_list_file = WORKDIR / "to_download.txt"
with open(url_list_file, "w") as f:
    for u in final_urls:
        f.write(u + "\n")
print(f" → Wrote {len(final_urls):,} URLs to {url_list_file}")

# ─── 4) PARALLEL DOWNLOAD VIA aria2c ──────────────────────────────────────────

print("4) Downloading with aria2c…")
subprocess.run([
    "aria2c",
    f"-x{ARIA2C_CONN}",
    "--dir", str(DL_DIR),
    "--input-file", str(url_list_file),
    "--auto-file-renaming=false",
    "--allow-overwrite=true"
], check=True)

print("✅ Finished downloading all selected files.")
print(f"Your files are in: {DL_DIR.resolve()}")
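
TARGET_REGIONS is declared above but never applied; the comment defers that to a post-download step. A sketch of what that filter could look like, assuming you want to subsample the downloaded .10.bed files to roughly TARGET_REGIONS regions in total (the output filename and the independent-Bernoulli sampling strategy are ours, not part of the script):

import random
from pathlib import Path

TARGET_REGIONS = 200_000
DL_DIR = Path("chip_atlas_fetch") / "downloads"

# count regions across all downloaded BED files
bed_files = sorted(DL_DIR.glob("*.bed"))
total = 0
for bed in bed_files:
    with bed.open() as fh:
        total += sum(1 for _ in fh)

# keep each region independently with probability keep_frac, so the
# expected number of surviving regions lands near TARGET_REGIONS
keep_frac = min(1.0, TARGET_REGIONS / max(total, 1))

out = Path("regions_subsampled.bed")
with out.open("w") as dst:
    for bed in bed_files:
        with bed.open() as src:
            for line in src:
                if random.random() < keep_frac:
                    dst.write(line)

print(f"Kept ~{int(total * keep_frac):,} of {total:,} regions in {out}")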