Spaces:

NOT-OMEGA
/

LogAI-Engine

Sleeping

App Files Community

NOT-OMEGA committed on Apr 14

Commit

de30f06

verified ·

1 Parent(s): c96bf7e

Update classify.py

Browse files

Files changed (1) hide show

classify.py +13 -15

classify.py CHANGED Viewed

@@ -8,9 +8,8 @@ Architecture:
 Changes in V3:
   - Tier-wise latency tracking (regex_ms, bert_ms, llm_ms)
   - Pipeline summary with p50/p95 per tier
-  - Defensive: LLM timeout + retry baked in via processor_llm
-  - classify_logs returns richer result dict
-  - 🚀 Added ThreadPoolExecutor for Parallel LLM Processing (Zero Lag)
 """
 from __future__ import annotations
 import time
@@ -36,7 +35,7 @@ def _make_result(label: str, tier: str, confidence, latency_ms: float) -> dict:
 # ── Single log (backward-compatible) ────────────────────────────────────────
 def classify_log(source: str, log_msg: str) -> dict:
-    """Single log classify karo. Returns label, tier, confidence, latency_ms."""
     results = classify_logs([(source, log_msg)])
     return results[0]
@@ -60,18 +59,17 @@ def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
     # ── Step 1: Route to groups ─────────────────────────────────────────────
     llm_indices   = []
     bert_indices  = []
-    entry_times   = [time.perf_counter()] * n  # approximate per-log start
     for i, (source, log_msg) in enumerate(logs):
-        entry_times[i] = time.perf_counter()
         if source == LEGACY_SOURCE:
             llm_indices.append(i)
         else:
-            t0    = time.perf_counter()
             label = classify_with_regex(log_msg)
-            t1    = time.perf_counter()
             if label:
-                results[i] = _make_result(label, "Regex", 1.0, (t1 - t0) * 1000)
             else:
                 bert_indices.append(i)
@@ -91,7 +89,7 @@ def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
             else:
                 llm_indices.append(idx)
-    # ── Step 3: LLM (Parallel Concurrency Fix) ──────────────────────────────
     if llm_indices:
         def parallel_llm(idx):
             src, msg = logs[idx]
@@ -101,8 +99,8 @@ def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
             tier = "LLM" if src == LEGACY_SOURCE else "LLM (fallback)"
             return idx, _make_result(label, tier, None, t_llm_ms)
-        # 🚨 GOOGLE-LEVEL FIX: 20 threads API calls ek saath marenge!
-        with ThreadPoolExecutor(max_workers=20) as executor:
             llm_results = list(executor.map(parallel_llm, llm_indices))
         for idx, res in llm_results:
@@ -149,14 +147,14 @@ def pipeline_summary(results: list[dict]) -> dict:
 # ── CSV batch classify ───────────────────────────────────────────────────────
 def classify_csv(input_path: str, output_path: str = "output.csv") -> tuple[str, pd.DataFrame]:
     """
-    CSV file classify karo.
     Required columns: 'source', 'log_message'
-    Output: adds 'predicted_label', 'tier_used', 'confidence', 'latency_ms'
     """
     df = pd.read_csv(input_path)
     required = {"source", "log_message"}
     if not required.issubset(df.columns):
-        raise ValueError(f"CSV mein ye columns chahiye: {required}. Mila: {set(df.columns)}")
     log_pairs = list(zip(df["source"], df["log_message"]))
     results   = classify_logs(log_pairs)

 Changes in V3:
   - Tier-wise latency tracking (regex_ms, bert_ms, llm_ms)
   - Pipeline summary with p50/p95 per tier
+  - Defensive: LLM timeout + circuit breaker baked in via processor_llm
+  - Parallelized LLM Tier using ThreadPoolExecutor for high throughput
 """
 from __future__ import annotations
 import time
 # ── Single log (backward-compatible) ────────────────────────────────────────
 def classify_log(source: str, log_msg: str) -> dict:
+    """Classify a single log. Returns label, tier, confidence, and latency_ms."""
     results = classify_logs([(source, log_msg)])
     return results[0]
     # ── Step 1: Route to groups ─────────────────────────────────────────────
     llm_indices   = []
     bert_indices  = []
     for i, (source, log_msg) in enumerate(logs):
         if source == LEGACY_SOURCE:
             llm_indices.append(i)
         else:
+            t_start = time.perf_counter()
             label = classify_with_regex(log_msg)
             if label:
+                latency_ms = (time.perf_counter() - t_start) * 1000
+                results[i] = _make_result(label, "Regex", 1.0, latency_ms)
             else:
                 bert_indices.append(i)
             else:
                 llm_indices.append(idx)
+    # ── Step 3: LLM (Parallel Concurrency) ──────────────────────────────────
     if llm_indices:
         def parallel_llm(idx):
             src, msg = logs[idx]
             tier = "LLM" if src == LEGACY_SOURCE else "LLM (fallback)"
             return idx, _make_result(label, tier, None, t_llm_ms)
+        # Parallelize API calls to prevent pipeline stall, restricted to 4 workers to prevent OOM
+        with ThreadPoolExecutor(max_workers=4) as executor:
             llm_results = list(executor.map(parallel_llm, llm_indices))
         for idx, res in llm_results:
 # ── CSV batch classify ───────────────────────────────────────────────────────
 def classify_csv(input_path: str, output_path: str = "output.csv") -> tuple[str, pd.DataFrame]:
     """
+    Process a batch of logs from a CSV file.
     Required columns: 'source', 'log_message'
+    Output: appends 'predicted_label', 'tier_used', 'confidence', 'latency_ms'
     """
     df = pd.read_csv(input_path)
     required = {"source", "log_message"}
     if not required.issubset(df.columns):
+        raise ValueError(f"Missing required columns in CSV. Expected: {required}. Found: {set(df.columns)}")
     log_pairs = list(zip(df["source"], df["log_message"]))
     results   = classify_logs(log_pairs)