Spaces:

cronos3k
/

document-integrity-verifier

Running on Zero

App Files Files Community

cronos3k committed on Jun 4

Commit

49d5b05

verified ·

1 Parent(s): 35d3db1

Fix Tier 2 fallbacks: text-only chat model for reasoning, drop unsupported VLM provider call

Browse files

Files changed (1) hide show

app.py +28 -39

app.py CHANGED Viewed

@@ -52,6 +52,17 @@ from legal_doc_redteam.zerogpu_gui import (
 REASONING_MODEL_ID = os.environ.get("REASONING_MODEL_ID", DEFAULT_REASONING_MODEL)
 VLM_OCR_MODEL_ID = os.environ.get("VLM_OCR_MODEL_ID", DEFAULT_VLM_OCR_MODEL)
 # Defaults tightened so the @spaces.GPU slice is held only as long as needed;
 # this reduces the chance of proxy-token expiry mid-call.
 REASONING_GPU_DURATION = int(os.environ.get("REASONING_GPU_DURATION", "60"))
@@ -127,14 +138,14 @@ if spaces is not None:
                 raise RuntimeError("HF_TOKEN not set; cannot use hf_inference fallback")
             from huggingface_hub import InferenceClient
-            client = InferenceClient(model=REASONING_MODEL_ID, token=HF_TOKEN_ENV)
-            extra_body: dict = {}
             effort = (reasoning_effort or "medium").lower()
             if effort not in {"low", "off", "none", "false", "no"}:
-                # Gemma 4 / Qwen3
                 extra_body["enable_thinking"] = True
-            # gpt-oss family
-            extra_body["reasoning_effort"] = effort
             response = client.chat.completions.create(
                 messages=[
                     {"role": "system", "content": SYSTEM_INSTRUCTIONS},
@@ -247,30 +258,18 @@ if spaces is not None:
             new_tokens = outputs[0][prompt_len:]
             return _vlm_processor.decode(new_tokens, skip_special_tokens=True).strip()
-        def _vlm_chat_hf_inference(image_path, prompt: str) -> str:
-            if not HF_TOKEN_ENV:
-                raise RuntimeError("HF_TOKEN not set; cannot use hf_inference fallback")
-            from huggingface_hub import InferenceClient
-            image_bytes = Path(str(image_path)).read_bytes()
-            data_url = "data:image/png;base64," + base64.b64encode(image_bytes).decode("ascii")
-            client = InferenceClient(model=VLM_OCR_MODEL_ID, token=HF_TOKEN_ENV)
-            response = client.chat.completions.create(
-                messages=[
-                    {
-                        "role": "user",
-                        "content": [
-                            {"type": "text", "text": prompt or DEFAULT_VLM_PROMPT},
-                            {"type": "image_url", "image_url": {"url": data_url}},
-                        ],
-                    }
-                ],
-                max_tokens=VLM_MAX_NEW_TOKENS,
-            )
-            return (response.choices[0].message.content or "").strip()
         def vlm_chat(image_path, prompt: str = DEFAULT_VLM_PROMPT) -> str:
-            """Three-tier resilient VLM OCR call (per page)."""
             last_exc: Exception | None = None
             for attempt in range(2):
@@ -286,17 +285,7 @@ if spaces is not None:
                     if attempt == 0 and _is_transient_gpu_error(exc):
                         continue
                     break
-            try:
-                print("[hf_zerogpu_space] VLM falling back to hf_inference",
-                      file=sys.stderr)
-                return _vlm_chat_hf_inference(image_path, prompt)
-            except Exception as exc:
-                print(
-                    f"[hf_zerogpu_space] VLM hf_inference fallback failed: "
-                    f"{type(exc).__name__}: {exc}",
-                    file=sys.stderr,
-                )
-            raise last_exc or RuntimeError("VLM unavailable (all tiers failed)")
         bind_vlm_fn(vlm_chat, model_id=VLM_OCR_MODEL_ID)
         _DEFAULT_VLM = "local_transformers"

 REASONING_MODEL_ID = os.environ.get("REASONING_MODEL_ID", DEFAULT_REASONING_MODEL)
 VLM_OCR_MODEL_ID = os.environ.get("VLM_OCR_MODEL_ID", DEFAULT_VLM_OCR_MODEL)
+# Tier 2 (HF Inference Providers) needs a model that is actually routable as
+# a chat-completion model. Multimodal Gemma 4 E4B is classified as
+# image-text-to-text and rejected by the chat endpoint; we therefore use a
+# separate text-only chat model for the hf_inference fallback. Override with
+# REASONING_HF_INFERENCE_MODEL_ID if your HF account has a different model
+# enabled on Inference Providers.
+REASONING_HF_INFERENCE_MODEL_ID = os.environ.get(
+    "REASONING_HF_INFERENCE_MODEL_ID",
+    "openai/gpt-oss-20b",
+)
 # Defaults tightened so the @spaces.GPU slice is held only as long as needed;
 # this reduces the chance of proxy-token expiry mid-call.
 REASONING_GPU_DURATION = int(os.environ.get("REASONING_GPU_DURATION", "60"))
                 raise RuntimeError("HF_TOKEN not set; cannot use hf_inference fallback")
             from huggingface_hub import InferenceClient
+            client = InferenceClient(
+                model=REASONING_HF_INFERENCE_MODEL_ID,
+                token=HF_TOKEN_ENV,
+            )
             effort = (reasoning_effort or "medium").lower()
+            extra_body: dict = {"reasoning_effort": effort}
             if effort not in {"low", "off", "none", "false", "no"}:
                 extra_body["enable_thinking"] = True
             response = client.chat.completions.create(
                 messages=[
                     {"role": "system", "content": SYSTEM_INSTRUCTIONS},
             new_tokens = outputs[0][prompt_len:]
             return _vlm_processor.decode(new_tokens, skip_special_tokens=True).strip()
         def vlm_chat(image_path, prompt: str = DEFAULT_VLM_PROMPT) -> str:
+            """Resilient VLM OCR call (per page).
+            Tier 1 only — local @spaces.GPU with one retry on transient
+            ZeroGPU errors. There is no Tier 2 for the VLM: the default
+            ``nanonets/Nanonets-OCR-s`` is not hosted on HF Inference
+            Providers and trying to route it there returned
+            ``model_not_supported`` errors that just delayed the failure.
+            On VLM failure the per-page OCR loop in ``ocr_integrity``
+            records the warning and proceeds with the three CPU OCR
+            engines, which already give multi-engine page coverage.
+            """
             last_exc: Exception | None = None
             for attempt in range(2):
                     if attempt == 0 and _is_transient_gpu_error(exc):
                         continue
                     break
+            raise last_exc or RuntimeError("VLM unavailable (local GPU failed)")
         bind_vlm_fn(vlm_chat, model_id=VLM_OCR_MODEL_ID)
         _DEFAULT_VLM = "local_transformers"