Spaces: Running
Update app.py
app.py CHANGED

@@ -1,286 +1,140 @@
--- app.py (old)

import os
import json
- import time
- import socket
- import threading
import requests
import pyarrow.parquet as pq
import gc
from pathlib import Path
- from huggingface_hub import HfApi

# ── Config ───────────────────────────────────────────────────────────────────
- HF_TOKEN
-
- WORKER_TIMEOUT = 700
- MAX_BUFFERED = 999999

- os.makedirs(
- api = HfApi(token=HF_TOKEN)
AUTH_HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"}

-
-         "repo" : "HuggingFaceFW/fineweb-edu",
-         "prefix" : "data/CC-MAIN-2025-26",
-         "skip" : 5,
-         "take" : 10,
-         "text_col": "text",
-     },
-     {
-         "name" : "wikipedia",
-         "type" : "hf_list",
-         "repo" : "wikimedia/wikipedia",
-         "prefix" : "20231101.en/train-",
-         "skip" : 2,
-         "take" : 18,
-         "text_col": "text",
-     },
-     {
-         "name" : "openwebmath",
-         "type" : "hf_list",
-         "repo" : "open-web-math/open-web-math",
-         "prefix" : "data/train-",
-         "skip" : 0,
-         "take" : 6,
-         "text_col": "text",
-     },
-     {
-         "name" : "code",
-         "type" : "url_list",
-         "text_col": "text",
-         "fmt" : "jsonl",
-         "urls" : [
-             f"https://huggingface.co/buckets/Neon-tech/Dataset-arranger/resolve/by-language/{lang}/shard_{str(i).zfill(6)}.jsonl?download=true"
-             for lang in ["C", "C++", "Java", "Go", "Rust", "Ruby", "PHP", "SQL", "C#", "Scala", "Lua", "Perl"]
-             for i in range(2)
-         ],
-     },
]

- #
-
- if
-
else:
-
-         for url in urls:
-             if url not in known_urls:
-                 state["queue"].append({
-                     "url" : url,
-                     "source" : name,
-                     "text_col" : src["text_col"],
-                     "fmt" : fmt,
-                 })
-                 known_urls.add(url)
-                 new_count += 1
-                 added += 1
-
-         print(f" {name}: {len(urls)} files | {added} new added to queue")
-
-     save_state(state)
-     print(f"\nTotal queued: {len(state['queue'])} | In state: {len(state['shards'])}")
-
- # ── Reclaim stale ────────────────────────────────────────────────────────────
- def reclaim_stale(state):
-     now = time.time()
-     reclaimed = 0
-     for name, info in state["shards"].items():
-         if info["status"] == "claimed" and info.get("claimed_at"):
-             if now - info["claimed_at"] > WORKER_TIMEOUT:
-                 print(f" Reclaiming: {name}")
-                 info["status"] = "pending"
-                 info["worker"] = None
-                 info["claimed_at"] = None
-                 reclaimed += 1
-     if reclaimed:
-         save_state(state)
-
- # ── Parquet → JSONL ──────────────────────────────────────────────────────────
- def parquet_to_jsonl(parquet_path, jsonl_path, text_col):
-     """Stream parquet batch by batch → write one JSON line per doc. No full load."""
-     pf = pq.ParquetFile(parquet_path)
-     n_written = 0
-     with open(jsonl_path, "w", encoding="utf-8") as out:
-         for batch in pf.iter_batches(batch_size=1_000, columns=[text_col]):
-             texts = batch.column(text_col).to_pylist()
-             for text in texts:
-                 if text and isinstance(text, str) and text.strip():
-                     out.write(json.dumps({"text": text.strip()}, ensure_ascii=False) + "\n")
                      n_written += 1
-             del texts
-             gc.collect()
-     return n_written
-
- # ── Download loop ────────────────────────────────────────────────────────────
- def download_loop(state):
-     while True:
-         try:
-             with open(STATE_FILE) as f:
-                 fresh = json.load(f)
-             state["shards"] = fresh["shards"]
-             state["queue"] = fresh.get("queue", [])
-         except Exception:
-             pass
-
-         reclaim_stale(state)
-
-         buffered = sum(1 for v in state["shards"].values() if v["status"] == "pending")
-         if buffered >= MAX_BUFFERED:
-             time.sleep(30)
-             continue
-
-         if not state["queue"]:
-             done = sum(1 for v in state["shards"].values() if v["status"] == "done")
-             total = len(state["shards"])
-             if done == total and total > 0:
-                 print("✓ All shards complete!")
-                 break
-             print(" Queue empty → sleeping...")
-             time.sleep(60)
-             continue
-
-         entry = state["queue"][0]
-         url = entry["url"]
-         source = entry["source"]
-         text_col = entry["text_col"]
-         fmt = entry.get("fmt", "parquet")

-
-         base_name = url.split("?")[0].split("/")[-1].replace(".parquet", "").replace(".jsonl", "")
-         shard_name = f"{source}__{base_name}_{lang}.jsonl"
-         jsonl_path = Path(RAW_DIR) / shard_name
-         tmp_path = Path(RAW_DIR) / f"{shard_name}.tmp"

-         print(f" Downloading: {source} | {base_name}")
-         try:
-             resp = requests.get(url, headers=AUTH_HEADERS, timeout=300, stream=True)
-             resp.raise_for_status()
-             with open(tmp_path, "wb") as f:
-                 for chunk in resp.iter_content(chunk_size=8 * 1024 * 1024):
-                     f.write(chunk)
-         except Exception as e:
-             print(f" Download failed: {e} → retrying in 30s")
              tmp_path.unlink(missing_ok=True)
-

-
         else:
-
-         state["queue"].pop(0)
-         state["shards"][shard_name] = {
-             "status" : "pending",
-             "url" : url,
-             "source" : source,
-             "worker" : None,
-             "claimed_at": None,
-             "error" : None,
-         }
-         save_state(state)
-         print(f" ✓ Ready: {shard_name}")
-         time.sleep(3)
-
- # ── Monitor ──────────────────────────────────────────────────────────────────
- def monitor_loop():
-     while True:
-         time.sleep(120)
-         try:
-             with open(STATE_FILE) as f:
-                 s = json.load(f)
-             shards = s["shards"]
-             queue = s.get("queue", [])
-             done = sum(1 for v in shards.values() if v["status"] == "done")
-             claimed = sum(1 for v in shards.values() if v["status"] == "claimed")
-             pending = sum(1 for v in shards.values() if v["status"] == "pending")
-             total = len(shards) + len(queue)
-             pct = (done / total * 100) if total else 0
-
-             src_done = {}
-             for v in shards.values():
-                 src = v.get("source", "?")
-                 if v["status"] == "done":
-                     src_done[src] = src_done.get(src, 0) + 1
-
-             print(f"[MONITOR] {done}/{total} ({pct:.1f}%) | {claimed} active | {pending} buffered | {len(queue)} queued")
-             for src, cnt in sorted(src_done.items()):
-                 print(f"    {src}: {cnt} done")
-         except Exception:
-             pass

- # ── Entry point ──────────────────────────────────────────────────────────────
if __name__ == "__main__":
-
-     state = load_state()
-     discover_all(state)
-     threading.Thread(target=monitor_loop, daemon=True).start()
-     threading.Thread(target=download_loop, args=(state,), daemon=True).start()
-     while True:
-         time.sleep(60)
+++ app.py (new)

import os
import json
import requests
import pyarrow.parquet as pq
import gc
from pathlib import Path

# ── Config ───────────────────────────────────────────────────────────────────
+ HF_TOKEN = os.environ.get("HF_TOKEN")
+ OUT_DIR = "/data/raw"
+ OUT_FILE = "/data/raw/phi__programming_books.jsonl"

+ os.makedirs(OUT_DIR, exist_ok=True)
AUTH_HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"}

+ PHI_URLS = [
+     "https://huggingface.co/datasets/open-phi/programming_books_llama/resolve/main/data/train-00000-of-00004-ea05c5cb63b570a8.parquet?download=true",
+     "https://huggingface.co/datasets/open-phi/programming_books_llama/resolve/main/data/train-00001-of-00004-d99cbe052bab0d4e.parquet?download=true",
+     "https://huggingface.co/datasets/open-phi/programming_books_llama/resolve/main/data/train-00002-of-00004-2c25f0e11d537eaf.parquet?download=true",
+     "https://huggingface.co/datasets/open-phi/programming_books_llama/resolve/main/data/train-00003-of-00004-faa8dbb07e5f02e8.parquet?download=true",
]

+ # Fields to extract in order (model excluded)
+ FIELDS = ["topic", "outline", "queries", "context", "markdown"]
+
+ # ── Helpers ──────────────────────────────────────────────────────────────────
+ def download_file(url, path):
+     r = requests.get(url, headers=AUTH_HEADERS, timeout=300, stream=True)
+     r.raise_for_status()
+     with open(path, "wb") as f:
+         for chunk in r.iter_content(chunk_size=8 * 1024 * 1024):
+             f.write(chunk)
+     print(f" Downloaded: {os.path.getsize(path)/1e6:.0f} MB")
+
+ def field_to_text(field_name, value):
+     """Convert a field value to clean text."""
+     if not value:
+         return ""
+     if isinstance(value, str):
+         return value.strip()
+     if isinstance(value, list):
+         # Join list items with newline
+         return "\n".join(str(item).strip() for item in value if item)
+     return str(value).strip()
+
+ def entry_to_text(row):
+     """Concatenate all useful fields into one coherent document."""
+     parts = []
+     for field in FIELDS:
+         val = row.get(field, None)
+         text = field_to_text(field, val)
+         if text:
+             parts.append(text)
+     return "\n\n".join(parts)
+
+ # ── Main ─────────────────────────────────────────────────────────────────────
+ def process_phi():
+     tmp_path = Path(OUT_DIR) / "phi_tmp.parquet"
+     n_written = 0
+     n_skipped = 0
+
+     # Resume → if output already exists, count lines
+     if Path(OUT_FILE).exists():
+         with open(OUT_FILE) as f:
+             existing = sum(1 for _ in f)
+         print(f"Resuming → {existing:,} entries already written")
     else:
+         existing = 0
+
+     entry_idx = 0
+
+     with open(OUT_FILE, "a", encoding="utf-8") as fout:
+         for url in PHI_URLS:
+             fname = url.split("?")[0].split("/")[-1]
+             print(f"\nDownloading: {fname}")
+             download_file(url, tmp_path)
+
+             pf = pq.ParquetFile(tmp_path)
+             # Discover available columns
+             available = pf.schema_arrow.names
+             use_cols = [f for f in FIELDS if f in available]
+             print(f" Available fields: {available}")
+             print(f" Using: {use_cols}")
+
+             for batch in pf.iter_batches(batch_size=500, columns=use_cols):
+                 rows = batch.to_pydict()
+                 n = len(batch)
+                 for i in range(n):
+                     entry_idx += 1
+                     if entry_idx <= existing:
+                         continue  # skip already written
+
+                     row = {col: rows[col][i] for col in use_cols}
+                     text = entry_to_text(row)
+
+                     if not text or len(text.strip()) < 50:
+                         n_skipped += 1
+                         continue
+
+                     fout.write(json.dumps({"text": text}, ensure_ascii=False) + "\n")
                      n_written += 1

+                 del rows; gc.collect()

             tmp_path.unlink(missing_ok=True)
+             print(f" ✓ {fname} done | written so far: {n_written + existing:,}")
+
+     total = n_written + existing
+     print(f"\n✓ Phi processing complete")
+     print(f" Total entries : {total:,}")
+     print(f" Skipped : {n_skipped:,}")
+     print(f" Output : {OUT_FILE}")
+     print(f" Size : {os.path.getsize(OUT_FILE)/1e6:.0f} MB")
+
+     # Add to state.json so workers pick it up
+     state_file = "/data/state.json"
+     if os.path.exists(state_file):
+         with open(state_file) as f:
+             state = json.load(f)

+         shard_name = "phi__programming_books.jsonl"
+         if shard_name not in state["shards"]:
+             state["shards"][shard_name] = {
+                 "status" : "pending",
+                 "url" : PHI_URLS[0],  # reference only
+                 "source" : "phi",
+                 "worker" : None,
+                 "claimed_at": None,
+                 "error" : None,
+             }
+             tmp = state_file + ".tmp"
+             with open(tmp, "w") as f:
+                 json.dump(state, f, indent=2)
+             os.replace(tmp, state_file)
+             print(f" ✓ Added to state.json → workers will pick up automatically")
         else:
+             print(f" Already in state.json")

if __name__ == "__main__":
+     process_phi()
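
For a concrete sense of what the new helpers produce: field_to_text flattens lists into newline-joined strings and drops empty values, and entry_to_text joins whichever FIELDS survive with blank lines into one document. A minimal sketch of that behaviour on a hypothetical row (the values below are invented for illustration, not taken from the dataset):

import json

# Hypothetical example row; "model" is intentionally absent, matching FIELDS above.
row = {
    "topic": "Binary search",
    "outline": ["1. Invariants", "2. Implementation", "3. Pitfalls"],
    "queries": ["how does binary search work?"],
    "context": "",
    "markdown": "# Binary Search\n\nHalve the search interval until the target is found.",
}

# Mirror of the joining rule in field_to_text / entry_to_text:
# lists become newline-joined strings, empty fields are skipped,
# and the remaining fields are separated by blank lines.
parts = []
for field in ["topic", "outline", "queries", "context", "markdown"]:
    value = row[field]
    if isinstance(value, list):
        text = "\n".join(str(v).strip() for v in value if v)
    else:
        text = str(value).strip() if value else ""
    if text:
        parts.append(text)

doc = "\n\n".join(parts)  # "context" drops out because it is empty
print(json.dumps({"text": doc}, ensure_ascii=False))  # one output line of the shard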
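
After a run, the script leaves two artifacts: the shard itself (one {"text": ...} JSON object per line in /data/raw/phi__programming_books.jsonl) and a "pending" entry under "shards" in /data/state.json. A small sanity check along these lines can confirm both; inspect_shard is a hypothetical helper, not part of app.py:

import json
import os

def inspect_shard(path="/data/raw/phi__programming_books.jsonl",
                  state_file="/data/state.json"):
    # Count documents and average length in the JSONL shard.
    n_docs, n_chars = 0, 0
    with open(path, encoding="utf-8") as f:
        for line in f:
            doc = json.loads(line)
            n_docs += 1
            n_chars += len(doc["text"])
    print(f"{n_docs:,} docs | avg {n_chars / max(n_docs, 1):,.0f} chars | "
          f"{os.path.getsize(path)/1e6:.0f} MB")

    # Confirm the shard was registered for the workers.
    if os.path.exists(state_file):
        with open(state_file) as f:
            state = json.load(f)
        print("state.json entry:", state["shards"].get("phi__programming_books.jsonl"))

if __name__ == "__main__":
    inspect_shard()

The status, worker, claimed_at, and error fields mirror the entries the old pipeline kept for its other shards, so existing workers should be able to treat this shard like any other pending one.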