Spaces:

NOT-OMEGA
/

LogAI-Engine

Sleeping

App Files Community

NOT-OMEGA committed on Apr 15

Commit

2d11b15

verified ·

1 Parent(s): 0d4acf4

Update classify.py

Browse files

Files changed (1) hide show

classify.py +33 -36

classify.py CHANGED Viewed

@@ -1,20 +1,12 @@
 """
-classify.py — 3-Tier Hybrid Pipeline (V10 — Thread-Safe & Shared Cache)
-Architecture:
-  LegacyCRM → LLM directly
-  Others    → Regex → BERT (batch) → LLM fallback
-Changes in V10:
-  - Removed buggy ProcessPoolExecutor (Fixes fork deadlocks & memory spikes).
-  - Global ThreadPoolExecutor for LLM (Fixes thread thrashing & context switching).
-  - LRU Cache is now genuinely shared across the entire run.
 """
 from __future__ import annotations
 import os
 import time
 import statistics
 import pandas as pd
 from functools import lru_cache
 from concurrent.futures import ThreadPoolExecutor
 from processor_regex import classify_with_regex
@@ -24,9 +16,6 @@ from processor_llm   import classify_with_llm
 # ── Config ──────────────────────────────────────────────────────────────────
 LEGACY_SOURCE = os.getenv("LEGACY_SOURCE", "LegacyCRM")
-# FIX: One global pool to prevent OS thread thrashing per chunk.
-_llm_executor = ThreadPoolExecutor(max_workers=min(32, (os.cpu_count() or 1) * 4))
 # ── Result type ─────────────────────────────────────────────────────────────
 def _make_result(label: str, tier: str, confidence, latency_ms: float) -> dict:
     return {
@@ -36,25 +25,20 @@ def _make_result(label: str, tier: str, confidence, latency_ms: float) -> dict:
         "latency_ms": round(latency_ms, 4),
     }
-# ── Caching Layer (Now Global) ──────────────────────────────────────────────
-@lru_cache(maxsize=500000)
 def cached_llm_call(log_msg: str) -> str:
-    """Executes the expensive LLM call only if the string misses the cache."""
     return classify_with_llm(log_msg)
-# ── Single log (backward-compatible) ────────────────────────────────────────
-def classify_log(source: str, log_msg: str) -> dict:
-    results = classify_logs([(source, log_msg)])
-    return results[0]
-# ── Batch pipeline (main entry point) ───────────────────────────────────────
 def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
     n       = len(logs)
     results = [None] * n
     llm_indices   = []
     bert_indices  = []
     for i, (source, log_msg) in enumerate(logs):
         if source == LEGACY_SOURCE:
             llm_indices.append(i)
@@ -68,10 +52,9 @@ def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
             else:
                 bert_indices.append(i)
-    # ── Step 2: BERT batch (ONNX handles its own multi-threading) ───────────
     if bert_indices:
         bert_msgs = [logs[i][1] for i in bert_indices]
         t_bert_start = time.perf_counter()
         bert_results = bert_batch(bert_msgs)
         t_bert_end   = time.perf_counter()
@@ -84,11 +67,10 @@ def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
             else:
                 llm_indices.append(idx)
-    # ── Step 3: LLM (I/O Bound - Using Global Thread Pool) ──────────────────
     if llm_indices:
         def parallel_llm(idx):
             src, msg = logs[idx]
             t_llm_0 = time.perf_counter()
             label = cached_llm_call(msg)
             t_llm_ms = (time.perf_counter() - t_llm_0) * 1000
@@ -98,32 +80,47 @@ def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
             return idx, _make_result(label, tier, None, t_llm_ms)
-        # Delegate entirely to the pre-warmed global thread pool
-        futures = [_llm_executor.submit(parallel_llm, idx) for idx in llm_indices]
-        for future in futures:
-            idx, res = future.result()
-            results[idx] = res
     return results
 def classify_csv(input_path: str, output_path: str = "output.csv") -> tuple[str, pd.DataFrame]:
-    """Single-process batch processing (relying on ONNX C++ threads + Python network threads)"""
     df = pd.read_csv(input_path)
     required = {"source", "log_message"}
     if not required.issubset(df.columns):
-        raise ValueError(f"Missing required columns in CSV. Expected: {required}. Found: {set(df.columns)}")
     log_pairs = list(zip(df["source"], df["log_message"]))
     total_logs = len(log_pairs)
-    print(f"🔥 Processing {total_logs} logs (Thread Pool active for LLMs)...")
     t_start = time.perf_counter()
-    # Process everything in one go - let classify_logs handle the internal batching
-    results = classify_logs(log_pairs)
     t_end = time.perf_counter()
     print(f"⏱️ True Wall-Clock Processing Time: {(t_end - t_start):.2f} seconds")
     df["predicted_label"] = [r["label"]       for r in results]

 """
+classify.py — 3-Tier Hybrid Pipeline (V11 — MAX SPEED + SAFE MULTIPROCESSING)
 """
 from __future__ import annotations
 import os
 import time
 import statistics
 import pandas as pd
+import multiprocessing as mp
 from functools import lru_cache
 from concurrent.futures import ThreadPoolExecutor
 from processor_regex import classify_with_regex
 # ── Config ──────────────────────────────────────────────────────────────────
 LEGACY_SOURCE = os.getenv("LEGACY_SOURCE", "LegacyCRM")
 # ── Result type ─────────────────────────────────────────────────────────────
 def _make_result(label: str, tier: str, confidence, latency_ms: float) -> dict:
     return {
         "latency_ms": round(latency_ms, 4),
     }
+# ── Caching Layer ───────────────────────────────────────────────────────────
+@lru_cache(maxsize=10000) # Reduced maxsize per-worker to prevent OOM
 def cached_llm_call(log_msg: str) -> str:
     return classify_with_llm(log_msg)
 def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
+    """Processes a chunk of logs."""
     n       = len(logs)
     results = [None] * n
     llm_indices   = []
     bert_indices  = []
+    # Step 1: Regex (Now running on multiple cores in parallel!)
     for i, (source, log_msg) in enumerate(logs):
         if source == LEGACY_SOURCE:
             llm_indices.append(i)
             else:
                 bert_indices.append(i)
+    # Step 2: BERT
     if bert_indices:
         bert_msgs = [logs[i][1] for i in bert_indices]
         t_bert_start = time.perf_counter()
         bert_results = bert_batch(bert_msgs)
         t_bert_end   = time.perf_counter()
             else:
                 llm_indices.append(idx)
+    # Step 3: LLM (Threaded inside each process)
     if llm_indices:
         def parallel_llm(idx):
             src, msg = logs[idx]
             t_llm_0 = time.perf_counter()
             label = cached_llm_call(msg)
             t_llm_ms = (time.perf_counter() - t_llm_0) * 1000
             return idx, _make_result(label, tier, None, t_llm_ms)
+        # Inner ThreadPool for API network requests
+        with ThreadPoolExecutor(max_workers=10) as executor:
+            for idx, res in executor.map(parallel_llm, llm_indices):
+                results[idx] = res
     return results
+def _process_chunk(chunk: list[tuple[str, str]]) -> list[dict]:
+    """Helper function for mapping."""
+    return classify_logs(chunk)
+# ── CSV batch classify (Safe Spawn Multiprocessing) ─────────────────────────
 def classify_csv(input_path: str, output_path: str = "output.csv") -> tuple[str, pd.DataFrame]:
     df = pd.read_csv(input_path)
     required = {"source", "log_message"}
     if not required.issubset(df.columns):
+        raise ValueError(f"Missing required columns in CSV.")
     log_pairs = list(zip(df["source"], df["log_message"]))
     total_logs = len(log_pairs)
+    # Use max cores for speed, but leave 1 for the OS/Gradio UI
+    safe_cores = max(1, (os.cpu_count() or 1) - 1)
+    chunk_size = 5000 # Slightly smaller chunks so data copies faster between processes
+    chunks = [log_pairs[i:i + chunk_size] for i in range(0, total_logs, chunk_size)]
+    results = []
+    print(f"🔥 Firing up {safe_cores} CPU Cores with SAFE SPAWN context...")
     t_start = time.perf_counter()
+    # FIX: Use 'spawn' context! This is the magic that prevents PyTorch/ONNX Segfaults
+    ctx = mp.get_context('spawn')
+    with ctx.ProcessPoolExecutor(max_workers=safe_cores) as executor:
+        for chunk_result in executor.map(_process_chunk, chunks):
+            results.extend(chunk_result)
     t_end = time.perf_counter()
     print(f"⏱️ True Wall-Clock Processing Time: {(t_end - t_start):.2f} seconds")
     df["predicted_label"] = [r["label"]       for r in results]