Upload 3 files

- .gitattributes +1 -0
- experimentList.tab +3 -0
- full_data_loading.py +72 -0
- smaller_data_loading.py +140 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+experimentList.tab filter=lfs diff=lfs merge=lfs -text

experimentList.tab
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d2a0ddc152403c55eacad60e5ece0b7ca35d90bd6d18129687323ac812be6233
+size 344940779
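
Note: experimentList.tab is tracked with Git LFS, so a clone made without LFS installed leaves only the three-line pointer above on disk rather than the ~345 MB table that both scripts below read. A minimal sanity check before running them, as a sketch (the `version https://git-lfs.github.com/spec/v1` prefix is the standard LFS pointer header; the helper name is ours):

def is_lfs_pointer(path: str) -> bool:
    """True if `path` still holds a Git LFS pointer rather than the real data."""
    with open(path, "rb") as f:
        head = f.read(100)
    return head.startswith(b"version https://git-lfs.github.com/spec/v1")

if is_lfs_pointer("experimentList.tab"):
    raise SystemExit("experimentList.tab is still an LFS pointer; run `git lfs pull` first.")
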
full_data_loading.py
ADDED
@@ -0,0 +1,72 @@

import pandas as pd
from pathlib import Path
import subprocess

# Read only cols 0–2, no header
df = pd.read_csv(
    "experimentList.tab",
    sep="\t",
    header=None,
    usecols=[0, 1, 2],
    names=["exp_id", "genome", "assay_group"],
    engine="python",
    on_bad_lines="skip",
    dtype=str
)

# Keep only known genome assemblies
VALID_GENOMES = {
    "hg19", "hg38",
    "mm9", "mm10",
    "rn6",
    "dm3", "dm6",
    "ce10", "ce11",
    "sacCer3"
}
df = df[df["genome"].isin(VALID_GENOMES)]
print("Assemblies in filtered data:", df["genome"].unique())

# Classify assay type
def modality(track):
    t = track.lower()
    if "atac" in t: return "ATAC"
    if "dnase" in t: return "DNase"
    if "bisulfite" in t or "methyl" in t: return "BS"
    return "ChIP"
df["modality"] = df["assay_group"].apply(modality)

# URL templates
def make_urls(exp, genome, mod):
    urls = []
    if mod in ("ChIP", "ATAC", "DNase"):
        urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bw/{exp}.bw")
        for thr in ("05", "10", "20"):
            urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bed{thr}/{exp}.{thr}.bed")
            urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bb{thr}/{exp}.{thr}.bb")
    else:
        urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bs/methyl/{exp}.methyl.bw")
        urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bs/cover/{exp}.cover.bw")
        for sub in ("hmr", "pmd", "hypermr"):
            urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bs/{sub}/Bed/{exp}.{sub}.bed")
            urls.append(f"https://chip-atlas.dbcls.jp/data/{genome}/eachData/bs/{sub}/BigBed/{exp}.{sub}.bb")
    return urls

# Write URL lists per genome
urls_dir = Path("urls_by_genome"); urls_dir.mkdir(exist_ok=True)
for genome, group in df.groupby("genome"):
    all_urls = []
    for _, row in group.iterrows():
        all_urls += make_urls(row.exp_id, genome, row.modality)
    uniq = sorted(set(all_urls))
    (urls_dir / f"urls_{genome}.txt").write_text("\n".join(uniq))
    print(f"{genome}: {len(uniq)} URLs")

# Download into raw/{genome}/
for url_file in urls_dir.glob("urls_*.txt"):
    genome = url_file.stem.split("_", 1)[1]
    dest = Path("raw") / genome
    dest.mkdir(parents=True, exist_ok=True)
    print(f"\nDownloading {genome} → {dest}/…")
    subprocess.run(["wget", "-nc", "-i", str(url_file), "-P", str(dest)], check=True)

print("Done! Check raw/{genome}/ for your files.")
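
One caveat with the download step above: not every experiment has every file type on the server, and wget exits non-zero when any URL in the list 404s, so the `check=True` call can raise even though most files downloaded fine. A sketch of a pre-flight pass that drops dead URLs first (the helper name is ours; `requests` is an extra dependency here, and since some servers answer HEAD differently from GET, treat this as a heuristic):

import requests
from pathlib import Path

def prune_dead_urls(url_file: Path, timeout: float = 10.0) -> None:
    """Rewrite `url_file` keeping only URLs that answer a HEAD request with 2xx."""
    alive = []
    for u in url_file.read_text().splitlines():
        try:
            if requests.head(u, allow_redirects=True, timeout=timeout).ok:
                alive.append(u)
        except requests.RequestException:
            pass  # unreachable → drop
    url_file.write_text("\n".join(alive))

for f in Path("urls_by_genome").glob("urls_*.txt"):
    prune_dead_urls(f)
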
smaller_data_loading.py
ADDED
@@ -0,0 +1,140 @@

#!/usr/bin/env python3
import os, sys, zipfile
import subprocess
import random
from pathlib import Path
import requests
import pandas as pd
from tqdm import tqdm

# ─── PARAMETERS ───────────────────────────────────────────────────────────────
# total target regions (rough guide; you'll filter post-download if needed)
TARGET_REGIONS = 200_000

# Assemblies to include
ASSEMBLIES = ["hg19", "hg38", "mm9", "mm10", "rn6", "dm3", "dm6", "ce10", "ce11", "sacCer3"]

# How many experiments to sample at most per protein (tune up/down)
MAX_EXPS_PER_PROTEIN = 50

# Number of parallel connections for aria2c
ARIA2C_CONN = 16

# Working directories
WORKDIR = Path("chip_atlas_fetch")
WORKDIR.mkdir(exist_ok=True)
LIST_DIR = WORKDIR / "lists"
LIST_DIR.mkdir(exist_ok=True)
DL_DIR = WORKDIR / "downloads"
DL_DIR.mkdir(exist_ok=True)

# ─── HELPERS ──────────────────────────────────────────────────────────────────

def download_and_extract(url, extract_to: Path):
    """Fetch a ZIP and unzip it."""
    local = extract_to / Path(url).name
    if not local.exists():
        print(f"→ Downloading {url}")
        resp = requests.get(url, stream=True); resp.raise_for_status()
        with open(local, "wb") as f:
            for chunk in resp.iter_content(1 << 20):
                f.write(chunk)
    with zipfile.ZipFile(local, "r") as z:
        z.extractall(extract_to)

# ─── 1) GET MASTER LISTS ──────────────────────────────────────────────────────

print("1) Fetching master file & experiment lists…")
FILELIST_URL = "https://dbarchive.biosciencedbc.jp/data/chip-atlas/LATEST/chip_atlas_file_list.zip"
EXPERIMENTLIST_URL = "https://dbarchive.biosciencedbc.jp/data/chip-atlas/LATEST/chip_atlas_experiment_list.zip"

download_and_extract(FILELIST_URL, LIST_DIR)
download_and_extract(EXPERIMENTLIST_URL, LIST_DIR)

filelist_txt = LIST_DIR / "chip_atlas_file_list.csv"
experiment_txt = LIST_DIR / "chip_atlas_experiment_list.csv"

# ─── 2) PARSE EXPERIMENT METADATA ─────────────────────────────────────────────

print("2) Parsing experiment → protein lookup…")
exp_df = pd.read_csv(
    experiment_txt,
    sep=None,          # let the python engine sniff the delimiter (comma vs. tab)
    engine="python",   # required when sep=None
    encoding="latin1"  # to avoid UnicodeDecodeErrors
)

print("Columns in experiment list:", exp_df.columns.tolist())

exp_df = (
    exp_df
    .loc[:, ['Experimental ID', 'Genome assembly', 'Antigen']]
    .rename(columns={
        'Experimental ID': 'exp_id',
        'Genome assembly': 'genome',
        'Antigen': 'assay_group'
    })
)

exp_df['protein'] = exp_df['assay_group'].str.replace(r'_ChIP.*', '', regex=True)

# Finally, filter to only the assemblies you care about:
exp_df = exp_df[exp_df['genome'].isin(ASSEMBLIES)]

# build lookups
exp_to_genome = exp_df.set_index("exp_id")["genome"].to_dict()
exp_to_protein = exp_df.set_index("exp_id")["protein"].to_dict()

# ─── 3) BUILD URL LIST DIRECTLY ───────────────────────────────────────────────

print("3) Building URL list for .bw + .10.bed…")
BASE = "https://dbarchive.biosciencedbc.jp/data/chip-atlas"
urls_by_exp = {}
for exp, genome in exp_to_genome.items():
    urls_by_exp[exp] = [
        f"{BASE}/data/{genome}/eachData/bw/{exp}.bw",
        f"{BASE}/data/{genome}/eachData/bed10/{exp}.10.bed"
    ]

# bucket experiments by protein
from collections import defaultdict
prot_exps = defaultdict(list)
for exp, prot in exp_to_protein.items():
    if exp in urls_by_exp:
        prot_exps[prot].append(exp)

# sample up to MAX_EXPS_PER_PROTEIN per protein
sampled_exps = []
for prot, exps in prot_exps.items():
    k = min(len(exps), MAX_EXPS_PER_PROTEIN)
    sampled_exps += random.sample(exps, k)

print(f" → Sampling {len(sampled_exps):,} experiments across {len(prot_exps)} proteins")

# collect URLs for just those experiments
final_urls = []
for exp in sampled_exps:
    final_urls += urls_by_exp[exp]
random.shuffle(final_urls)

# write out for aria2c
url_list_file = WORKDIR / "to_download.txt"
with open(url_list_file, "w") as f:
    for u in final_urls:
        f.write(u + "\n")
print(f" → Wrote {len(final_urls):,} URLs to {url_list_file}")

# ─── 4) PARALLEL DOWNLOAD VIA aria2c ──────────────────────────────────────────

print("4) Downloading with aria2c…")
subprocess.run([
    "aria2c",
    f"-x{ARIA2C_CONN}",
    "--dir", str(DL_DIR),
    "--input-file", str(url_list_file),
    "--auto-file-renaming=false",
    "--allow-overwrite=true"
], check=True)

print("✅ Finished downloading all selected files.")
print(f"Your files are in: {DL_DIR.resolve()}")
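
TARGET_REGIONS is declared above but never applied; the comment defers that to a post-download step. A sketch of what that filter could look like, assuming you want to subsample the downloaded .10.bed files to roughly TARGET_REGIONS regions in total (the output filename and the independent-Bernoulli sampling strategy are ours, not part of the script):

import random
from pathlib import Path

TARGET_REGIONS = 200_000
DL_DIR = Path("chip_atlas_fetch") / "downloads"

# count regions across all downloaded BED files
bed_files = sorted(DL_DIR.glob("*.bed"))
total = 0
for bed in bed_files:
    with bed.open() as fh:
        total += sum(1 for _ in fh)

# keep each region independently with probability keep_frac, so the
# expected number of surviving regions lands near TARGET_REGIONS
keep_frac = min(1.0, TARGET_REGIONS / max(total, 1))

out = Path("regions_subsampled.bed")
with out.open("w") as dst:
    for bed in bed_files:
        with bed.open() as src:
            for line in src:
                if random.random() < keep_frac:
                    dst.write(line)

print(f"Kept ~{int(total * keep_frac):,} of {total:,} regions in {out}")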