Spaces:

Sam-max1
/

nitdaa

Running

App Files Files Community

Sam-max1 committed 21 days ago

Commit

2f1cebb

verified ·

1 Parent(s): 6cf3d23

Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

NITDAA_HEALTHEXPERT_USER_GUIDE.md +1 -1
app.py +121 -154
update_sync.py +159 -0

NITDAA_HEALTHEXPERT_USER_GUIDE.md CHANGED Viewed

@@ -26,7 +26,7 @@ When a question is submitted:
 - Your question will instantly appear in the output window.
 - The system will display a **"Generating answer.. (1.2s)"** timer.
 - Once context is retrieved from the Vector and Graph databases, the LLM will stream the Markdown-formatted answer back to the screen.
-- analysis may take upto 300 seconds due to slow shared resources. please keep patience.
 - Responses remain preserved in the scrollable window for the duration of your session.
 ## 5. Security & Session Handling

 - Your question will instantly appear in the output window.
 - The system will display a **"Generating answer.. (1.2s)"** timer.
 - Once context is retrieved from the Vector and Graph databases, the LLM will stream the Markdown-formatted answer back to the screen.
+- Analysis may take up to 300 seconds due to slow shared resources. Please be patient.
 - Responses remain preserved in the scrollable window for the duration of your session.
 ## 5. Security & Session Handling

app.py CHANGED Viewed

@@ -692,103 +692,7 @@ def process_document_pipeline(path: str, orig_name: str, tier: str, token: str,
             except OSError as e:
                 log.warning("Failed to delete %s: %s", path, e)
-def sync_he_data():
-    try:
-        import os, shutil
-        from huggingface_hub import snapshot_download
-        token = os.environ.get("HF_PRIVATE_TOKEN") or os.environ.get("HF_TOKEN")
-        if not token:
-            log.error("HF_PRIVATE_TOKEN or HF_TOKEN environment variable is not set. Dataset synchronization will be skipped.")
-            return
-        # --- 2-Way Log Sync on Startup ---
-        log_dir = Path(__file__).parent / "app" / "logs"
-        log_dir.mkdir(parents=True, exist_ok=True)
-        try:
-            if token:
-                for log_file in ["nitdaa_sessions.json", "nitdaa_summary.json"]:
-                    local_p = log_dir / log_file
-                    try:
-                        dl_path = hf_hub_download(repo_id="Sam-max1/mat_data", filename=log_file, repo_type="dataset", token=token)
-                        if os.path.exists(dl_path):
-                            # Append any existing local logs not in the remote file to the remote file, then replace local
-                            remote_lines = set(open(dl_path).readlines())
-                            if local_p.exists():
-                                for line in open(local_p).readlines():
-                                    if line not in remote_lines:
-                                        remote_lines.add(line)
-                            with open(local_p, "w") as f:
-                                for line in sorted(list(remote_lines)):
-                                    f.write(line)
-                            log.info(f"Successfully merged {log_file} from mat_data.")
-                    except Exception as e:
-                        log.warning(f"Could not download {log_file} from mat_data (it may not exist yet): {e}")
-        except Exception as e:
-            log.warning(f"Log sync failed: {e}")
-        # ---------------------------------
-        kbdocs_dir = Path(__file__).parent / "kbdocs"
-        kbdocs_dir.mkdir(exist_ok=True)
-        tmp_sync_dir = Path("/tmp/he_data_sync")
-        if tmp_sync_dir.exists():
-            shutil.rmtree(tmp_sync_dir)
-        tmp_sync_dir.mkdir(exist_ok=True)
-        log.info("Syncing fresh files from Sam-max1/he-data to local /tmp...")
-        snapshot_download(
-            repo_id="Sam-max1/he-data",
-            repo_type="dataset",
-            local_dir=str(tmp_sync_dir),
-            token=token,
-            ignore_patterns=[".git*"]
-        )
-        from pipeline import vector_store, graph_store
-        # Determine if there are differences between kbdocs and the downloaded he-data
-        local_files = {f.name: f.stat().st_size for f in kbdocs_dir.glob("*.*") if f.is_file()}
-        remote_files = {f.name: f.stat().st_size for f in tmp_sync_dir.glob("*.*") if f.is_file()}
-        is_different = False
-        if set(local_files.keys()) != set(remote_files.keys()):
-            is_different = True
-        else:
-            for k in local_files:
-                if local_files[k] != remote_files[k]:
-                    is_different = True
-                    break
-        if is_different:
-            log.info("Detected changes in Sam-max1/he-data! Purging databases and re-syncing kbdocs.")
-            # Clear DBs
-            vector_store.purge()
-            if graph_store.is_available():
-                graph_store.purge()
-            # Wipe local kbdocs and replace
-            shutil.rmtree(kbdocs_dir)
-            shutil.copytree(tmp_sync_dir, kbdocs_dir)
-            ingested_count = 0
-            for file_path in kbdocs_dir.glob("*.*"):
-                if file_path.is_file():
-                    log.info(f"Auto-ingesting file: {file_path.name}")
-                    process_document_pipeline(str(file_path), file_path.name, "foundation", "admin", delete_after=False)
-                    ingested_count += 1
-            log.info("=== Full Data Re-Ingestion Complete ===")
-        else:
-            log.info("kbdocs is completely up to date with he-data. No ingestion needed.")
-        log.info(f"Vector DB Chunks: {vector_store.count()}")
-        if graph_store.is_available():
-            stats = graph_store.get_stats()
-            log.info(f"Kuzu DB Nodes: {stats.get('nodes', 0)}, Edges: {stats.get('edges', 0)}")
-    except Exception as e:
-        log.error("Failed to sync he-data: %s", e)
-threading.Thread(target=sync_he_data, daemon=True).start()
 @app.route("/api/ingest", methods=["POST"])
 @limiter.limit("10 per minute")
@@ -1133,30 +1037,14 @@ def probe_embed():
 def start_auto_ingest_thread():
     def _auto_ingest_worker():
         global _auto_ingest_status
-        kbdocs_dir = Path(__file__).parent / "kbdocs"
-        kbdocs_dir.mkdir(parents=True, exist_ok=True)
-        hf_token = os.environ.get("HF_PRIVATE_TOKEN")
-        if hf_token:
-            import logging
-            from huggingface_hub import snapshot_download
-            try:
-                logging.info("HF_PRIVATE_TOKEN found, syncing dataset Sam-max1/he-data to %s...", kbdocs_dir)
-                snapshot_download(
-                    repo_id="Sam-max1/he-data",
-                    repo_type="dataset",
-                    local_dir=str(kbdocs_dir),
-                    token=hf_token
-                )
-                logging.info("Dataset synced successfully.")
-            except Exception as e:
-                logging.error("Failed to sync HuggingFace dataset: %s", e)
-        elif not kbdocs_dir.exists():
-            return
-        import requests, time
         log.info("Auto-ingest: waiting for LLM services to boot...")
-        # Wait up to 60s for models
         for _ in range(30):
             try:
                 r1 = requests.get(f"{config.EMBED_BASE_URL}/health", timeout=2)
@@ -1172,46 +1060,125 @@ def start_auto_ingest_thread():
             _auto_ingest_status["done"] = True
             return
-        # Check existing documents to avoid re-ingesting
-        existing = {d["source"] for d in vector_store.list_documents("admin")}
-        files_to_ingest = []
-        for f in kbdocs_dir.iterdir():
-            if f.is_file() and _allowed(f.name) and f.name not in existing:
-                files_to_ingest.append(f)
-        if not files_to_ingest:
-            log.info("Auto-ingest: no new files found in kbdocs.")
             _auto_ingest_status["done"] = True
             return
-        log.info("Auto-ingesting %d files from kbdocs...", len(files_to_ingest))
-        config.current_session.set("admin")
-        _auto_ingest_status["running"] = True
-        _auto_ingest_status["total"] = len(files_to_ingest)
-        _auto_ingest_status["completed"] = 0
-        _auto_ingest_status["results"] = []
-        _auto_ingest_status["done"] = False
-        for path in files_to_ingest:
-            _auto_ingest_status["current_file"] = path.name
-            res = process_document_pipeline(str(path), path.name, tier="foundation", token="admin", delete_after=False)
-            _auto_ingest_status["completed"] += 1
-            _auto_ingest_status["results"].append({
-                "file": path.name,
-                "ok": res["ok"],
-                "result": res["result"],
-            })
-            if res["ok"]:
-                log.info("Auto-ingest successful for %s", path.name)
-            else:
-                log.error("Auto-ingest failed for %s: %s", path.name, res["result"])
-        _auto_ingest_status["running"] = False
-        _auto_ingest_status["done"] = True
-        _auto_ingest_status["current_file"] = None
-        trigger_kv_cache_update("admin")
     threading.Thread(target=_auto_ingest_worker, daemon=True).start()

             except OSError as e:
                 log.warning("Failed to delete %s: %s", path, e)
 @app.route("/api/ingest", methods=["POST"])
 @limiter.limit("10 per minute")
 def start_auto_ingest_thread():
     def _auto_ingest_worker():
         global _auto_ingest_status
+        import requests, time, shutil, os
+        from huggingface_hub import snapshot_download, hf_hub_download
+        from pathlib import Path
+        token = os.environ.get("HF_PRIVATE_TOKEN") or os.environ.get("HF_TOKEN")
+        # --- Wait for LLM services to boot before doing anything ---
         log.info("Auto-ingest: waiting for LLM services to boot...")
         for _ in range(30):
             try:
                 r1 = requests.get(f"{config.EMBED_BASE_URL}/health", timeout=2)
             _auto_ingest_status["done"] = True
             return
+        if not token:
+            log.error("HF_PRIVATE_TOKEN or HF_TOKEN environment variable is not set. Dataset synchronization will be skipped.")
+            _auto_ingest_status["error"] = "HF Token missing"
             _auto_ingest_status["done"] = True
             return
+        # --- 2-Way Log Sync on Startup ---
+        log_dir = Path(__file__).parent / "app" / "logs"
+        log_dir.mkdir(parents=True, exist_ok=True)
+        try:
+            for log_file in ["nitdaa_sessions.json", "nitdaa_summary.json"]:
+                local_p = log_dir / log_file
+                try:
+                    dl_path = hf_hub_download(repo_id="Sam-max1/mat_data", filename=log_file, repo_type="dataset", token=token)
+                    if os.path.exists(dl_path):
+                        remote_lines = set(open(dl_path).readlines())
+                        if local_p.exists():
+                            for line in open(local_p).readlines():
+                                if line not in remote_lines:
+                                    remote_lines.add(line)
+                        with open(local_p, "w") as f:
+                            for line in sorted(list(remote_lines)):
+                                f.write(line)
+                        log.info(f"Successfully merged {log_file} from mat_data.")
+                except Exception as e:
+                    log.warning(f"Could not download {log_file} from mat_data (it may not exist yet): {e}")
+        except Exception as e:
+            log.warning(f"Log sync failed: {e}")
+        # ---------------------------------
+        kbdocs_dir = Path(__file__).parent / "kbdocs"
+        kbdocs_dir.mkdir(parents=True, exist_ok=True)
+        tmp_sync_dir = Path("/tmp/he_data_sync")
+        if tmp_sync_dir.exists():
+            shutil.rmtree(tmp_sync_dir)
+        tmp_sync_dir.mkdir(exist_ok=True)
+        log.info("Syncing fresh files from Sam-max1/he-data to local /tmp...")
+        try:
+            snapshot_download(
+                repo_id="Sam-max1/he-data",
+                repo_type="dataset",
+                local_dir=str(tmp_sync_dir),
+                token=token,
+                ignore_patterns=[".git*"]
+            )
+        except Exception as e:
+            log.error(f"Failed to download he-data dataset: {e}")
+            _auto_ingest_status["error"] = f"Download failed: {e}"
+            _auto_ingest_status["done"] = True
+            return
+        from pipeline import vector_store, graph_store
+        local_files = {f.name: f.stat().st_size for f in kbdocs_dir.glob("*.*") if f.is_file()}
+        remote_files = {f.name: f.stat().st_size for f in tmp_sync_dir.glob("*.*") if f.is_file()}
+        is_different = False
+        if set(local_files.keys()) != set(remote_files.keys()):
+            is_different = True
+        else:
+            for k in local_files:
+                if local_files[k] != remote_files[k]:
+                    is_different = True
+                    break
+        if is_different:
+            log.info("Detected changes in Sam-max1/he-data! Purging databases and re-syncing kbdocs.")
+            vector_store.purge()
+            if graph_store.is_available():
+                graph_store.purge()
+            shutil.rmtree(kbdocs_dir)
+            shutil.copytree(tmp_sync_dir, kbdocs_dir)
+            files_to_ingest = [f for f in kbdocs_dir.glob("*.*") if f.is_file() and _allowed(f.name)]
+            if not files_to_ingest:
+                log.info("No valid files to ingest in he-data.")
+                _auto_ingest_status["done"] = True
+                return
+            config.current_session.set("admin")
+            _auto_ingest_status["running"] = True
+            _auto_ingest_status["total"] = len(files_to_ingest)
+            _auto_ingest_status["completed"] = 0
+            _auto_ingest_status["results"] = []
+            _auto_ingest_status["done"] = False
+            for path in files_to_ingest:
+                _auto_ingest_status["current_file"] = path.name
+                log.info(f"Auto-ingesting file: {path.name}")
+                res = process_document_pipeline(str(path), path.name, "foundation", "admin", delete_after=False)
+                _auto_ingest_status["completed"] += 1
+                _auto_ingest_status["results"].append({
+                    "file": path.name,
+                    "ok": res["ok"],
+                    "result": res["result"],
+                })
+                if res["ok"]:
+                    log.info("Auto-ingest successful for %s", path.name)
+                else:
+                    log.error("Auto-ingest failed for %s: %s", path.name, res["result"])
+            _auto_ingest_status["running"] = False
+            _auto_ingest_status["done"] = True
+            _auto_ingest_status["current_file"] = None
+            trigger_kv_cache_update("admin")
+            log.info("=== Full Data Re-Ingestion Complete ===")
+        else:
+            log.info("kbdocs is completely up to date with he-data. No ingestion needed.")
+            _auto_ingest_status["done"] = True
+        log.info(f"Vector DB Chunks: {vector_store.count()}")
+        if graph_store.is_available():
+            stats = graph_store.get_stats()
+            log.info(f"Kuzu DB Nodes: {stats.get('nodes', 0)}, Edges: {stats.get('edges', 0)}")
     threading.Thread(target=_auto_ingest_worker, daemon=True).start()

update_sync.py ADDED Viewed

	@@ -0,0 +1,159 @@

+# One-shot patch script: rewrites start_auto_ingest_thread() in app.py in place,
+# substituting the unified "wait for services, sync dataset, ingest" worker that
+# is stored verbatim in new_func below. Run it from the directory holding app.py.
+import re
+# Pin the encoding so the round-trip does not depend on the platform default.
+with open('app.py', 'r', encoding='utf-8') as f:
+    content = f.read()
+# The unified function
+# NOTE(review): the template reads the log files with open(...).readlines() and
+# never closes them; it is left untouched here so the generated app.py stays
+# identical to what this script originally produced.
+new_func = """def start_auto_ingest_thread():
+    def _auto_ingest_worker():
+        global _auto_ingest_status
+        import requests, time, shutil, os
+        from huggingface_hub import snapshot_download, hf_hub_download
+        from pathlib import Path
+        token = os.environ.get("HF_PRIVATE_TOKEN") or os.environ.get("HF_TOKEN")
+        # --- Wait for LLM services to boot before doing anything ---
+        log.info("Auto-ingest: waiting for LLM services to boot...")
+        for _ in range(30):
+            try:
+                r1 = requests.get(f"{config.EMBED_BASE_URL}/health", timeout=2)
+                r2 = requests.get(f"{config.LLM_BASE_URL}/health", timeout=2)
+                if r1.status_code == 200 and r2.status_code == 200:
+                    break
+            except Exception:
+                pass
+            time.sleep(2)
+        else:
+            log.warning("Auto-ingest aborted: LLM services not online.")
+            _auto_ingest_status["error"] = "LLM services not online within 60s"
+            _auto_ingest_status["done"] = True
+            return
+        if not token:
+            log.error("HF_PRIVATE_TOKEN or HF_TOKEN environment variable is not set. Dataset synchronization will be skipped.")
+            _auto_ingest_status["error"] = "HF Token missing"
+            _auto_ingest_status["done"] = True
+            return
+        # --- 2-Way Log Sync on Startup ---
+        log_dir = Path(__file__).parent / "app" / "logs"
+        log_dir.mkdir(parents=True, exist_ok=True)
+        try:
+            for log_file in ["nitdaa_sessions.json", "nitdaa_summary.json"]:
+                local_p = log_dir / log_file
+                try:
+                    dl_path = hf_hub_download(repo_id="Sam-max1/mat_data", filename=log_file, repo_type="dataset", token=token)
+                    if os.path.exists(dl_path):
+                        remote_lines = set(open(dl_path).readlines())
+                        if local_p.exists():
+                            for line in open(local_p).readlines():
+                                if line not in remote_lines:
+                                    remote_lines.add(line)
+                        with open(local_p, "w") as f:
+                            for line in sorted(list(remote_lines)):
+                                f.write(line)
+                        log.info(f"Successfully merged {log_file} from mat_data.")
+                except Exception as e:
+                    log.warning(f"Could not download {log_file} from mat_data (it may not exist yet): {e}")
+        except Exception as e:
+            log.warning(f"Log sync failed: {e}")
+        # ---------------------------------
+        kbdocs_dir = Path(__file__).parent / "kbdocs"
+        kbdocs_dir.mkdir(parents=True, exist_ok=True)
+        tmp_sync_dir = Path("/tmp/he_data_sync")
+        if tmp_sync_dir.exists():
+            shutil.rmtree(tmp_sync_dir)
+        tmp_sync_dir.mkdir(exist_ok=True)
+        log.info("Syncing fresh files from Sam-max1/he-data to local /tmp...")
+        try:
+            snapshot_download(
+                repo_id="Sam-max1/he-data",
+                repo_type="dataset",
+                local_dir=str(tmp_sync_dir),
+                token=token,
+                ignore_patterns=[".git*"]
+            )
+        except Exception as e:
+            log.error(f"Failed to download he-data dataset: {e}")
+            _auto_ingest_status["error"] = f"Download failed: {e}"
+            _auto_ingest_status["done"] = True
+            return
+        from pipeline import vector_store, graph_store
+        local_files = {f.name: f.stat().st_size for f in kbdocs_dir.glob("*.*") if f.is_file()}
+        remote_files = {f.name: f.stat().st_size for f in tmp_sync_dir.glob("*.*") if f.is_file()}
+        is_different = False
+        if set(local_files.keys()) != set(remote_files.keys()):
+            is_different = True
+        else:
+            for k in local_files:
+                if local_files[k] != remote_files[k]:
+                    is_different = True
+                    break
+        if is_different:
+            log.info("Detected changes in Sam-max1/he-data! Purging databases and re-syncing kbdocs.")
+            vector_store.purge()
+            if graph_store.is_available():
+                graph_store.purge()
+            shutil.rmtree(kbdocs_dir)
+            shutil.copytree(tmp_sync_dir, kbdocs_dir)
+            files_to_ingest = [f for f in kbdocs_dir.glob("*.*") if f.is_file() and _allowed(f.name)]
+            if not files_to_ingest:
+                log.info("No valid files to ingest in he-data.")
+                _auto_ingest_status["done"] = True
+                return
+            config.current_session.set("admin")
+            _auto_ingest_status["running"] = True
+            _auto_ingest_status["total"] = len(files_to_ingest)
+            _auto_ingest_status["completed"] = 0
+            _auto_ingest_status["results"] = []
+            _auto_ingest_status["done"] = False
+            for path in files_to_ingest:
+                _auto_ingest_status["current_file"] = path.name
+                log.info(f"Auto-ingesting file: {path.name}")
+                res = process_document_pipeline(str(path), path.name, "foundation", "admin", delete_after=False)
+                _auto_ingest_status["completed"] += 1
+                _auto_ingest_status["results"].append({
+                    "file": path.name,
+                    "ok": res["ok"],
+                    "result": res["result"],
+                })
+                if res["ok"]:
+                    log.info("Auto-ingest successful for %s", path.name)
+                else:
+                    log.error("Auto-ingest failed for %s: %s", path.name, res["result"])
+            _auto_ingest_status["running"] = False
+            _auto_ingest_status["done"] = True
+            _auto_ingest_status["current_file"] = None
+            trigger_kv_cache_update("admin")
+            log.info("=== Full Data Re-Ingestion Complete ===")
+        else:
+            log.info("kbdocs is completely up to date with he-data. No ingestion needed.")
+            _auto_ingest_status["done"] = True
+        log.info(f"Vector DB Chunks: {vector_store.count()}")
+        if graph_store.is_available():
+            stats = graph_store.get_stats()
+            log.info(f"Kuzu DB Nodes: {stats.get('nodes', 0)}, Edges: {stats.get('edges', 0)}")
+    threading.Thread(target=_auto_ingest_worker, daemon=True).start()"""
+# Replace start_auto_ingest_thread: the non-greedy DOTALL pattern spans from the
+# def line to the first 4-space-indented thread start that closes the function.
+# The replacement is passed as a callable so new_func is inserted verbatim;
+# a plain string would have its backslashes and group references interpreted.
+content, replaced = re.subn(r'def start_auto_ingest_thread\(\):.*?    threading\.Thread\(target=_auto_ingest_worker, daemon=True\)\.start\(\)', lambda _match: new_func, content, flags=re.DOTALL)
+# Fail loudly instead of silently writing app.py back unchanged when nothing matched.
+if not replaced:
+    raise SystemExit("update_sync: start_auto_ingest_thread() not found in app.py; file left untouched")
+with open('app.py', 'w', encoding='utf-8') as f:
+    f.write(content)