Spaces:

encryptd
/

ocr_vlm_thinking

Paused

App Files Files Community

encryptd committed on Apr 1

Commit

511d3b2

1 Parent(s): 29918ac

prog update

Browse files

Files changed (3) hide show

README.md +1 -1
app.py +51 -49
requirements.txt +2 -1

README.md CHANGED Viewed

@@ -4,7 +4,7 @@ emoji: 💬
 colorFrom: yellow
 colorTo: purple
 sdk: gradio
-sdk_version: 4.44.1
 app_file: app.py
 pinned: false
 hf_oauth: true

 colorFrom: yellow
 colorTo: purple
 sdk: gradio
+sdk_version: 5.42.0
 app_file: app.py
 pinned: false
 hf_oauth: true

app.py CHANGED Viewed

@@ -5,15 +5,15 @@ import httpx
 import json
 import base64
 from io import BytesIO
-from fastapi import FastAPI, Request
-from fastapi.responses import StreamingResponse, JSONResponse, RedirectResponse
 import uvicorn
 import gradio as gr
 from openai import OpenAI
 # --- CONFIGURATION ---
 MODEL_ID = "numind/NuMarkdown-8B-Thinking"
-GPU_UTILIZATION = 0.95
 MAX_MODEL_LEN = 16384
 VLLM_PORT = 8000
 HF_PORT = 7860
@@ -22,8 +22,8 @@ HF_PORT = 7860
 def start_vllm():
     if "VLLM_PID" in os.environ:
         return
-    print("--- STARTING VLLM ---")
-    command =[
         "python3", "-m", "vllm.entrypoints.openai.api_server",
         "--model", MODEL_ID,
         "--host", "127.0.0.1",
@@ -34,51 +34,22 @@ def start_vllm():
         "--dtype", "bfloat16",
         "--limit-mm-per-prompt", '{"image": 1}'
     ]
-    proc = subprocess.Popen(command, stdout=sys.stdout, stderr=sys.stderr)
-    os.environ["VLLM_PID"] = str(proc.pid)
 start_vllm()
-# --- STEP 2: FASTAPI GATEKEEPER (EXTERNAL API) ---
-app = FastAPI()
-@app.api_route("/v1/{path:path}", methods=["GET", "POST", "PUT", "DELETE"])
-async def gatekeeper_proxy(path: str, request: Request):
-    """Intercepts external Docling API calls and strips reasoning tags."""
-    target_url = f"http://127.0.0.1:{VLLM_PORT}/v1/{path}"
-    headers = {k: v for k, v in request.headers.items() if k.lower() not in ["host", "content-length"]}
-    async with httpx.AsyncClient(timeout=300.0) as client:
-        try:
-            if path == "chat/completions" and request.method == "POST":
-                body = await request.json()
-                if not body.get("stream", False):
-                    resp = await client.post(target_url, headers=headers, json=body)
-                    if resp.status_code == 200:
-                        data = resp.json()
-                        content = data["choices"][0]["message"].get("content", "")
-                        # SUPPRESS REASONING TAGS FOR EXTERNAL API
-                        if "</think>" in content:
-                            data["choices"][0]["message"]["content"] = content.split("</think>")[-1].strip()
-                        return JSONResponse(content=data)
-                    return JSONResponse(status_code=resp.status_code, content=resp.json())
-            proxy_req = client.build_request(request.method, target_url, headers=headers, content=await request.body())
-            r = await client.send(proxy_req, stream=True)
-            return StreamingResponse(r.aiter_raw(), status_code=r.status_code, headers=dict(r.headers))
-        except Exception as e:
-            return JSONResponse(status_code=503, content={"error": str(e)})
-# --- STEP 3: GRADIO UI ---
 def run_ui_test(image, prompt):
-    print(">>> UI Request Received")
     if image is None: return "⚠️ Please upload an image."
     try:
         with httpx.Client() as check:
             check.get(f"http://127.0.0.1:{VLLM_PORT}/v1/models", timeout=2.0)
-    except Exception:
-        return "⏳ Model is still loading (takes 3-5 mins). Please wait."
     client = OpenAI(base_url=f"http://127.0.0.1:{VLLM_PORT}/v1", api_key="EMPTY")
     try:
@@ -89,21 +60,21 @@ def run_ui_test(image, prompt):
         completion = client.chat.completions.create(
             model=MODEL_ID,
-            messages=[{"role": "user", "content":[
                 {"type": "text", "text": prompt or "Convert to markdown."},
                 {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}}
             ]}],
-            max_tokens=4096,
             timeout=300.0
         )
         content = completion.choices[0].message.content
         return content.split("</think>")[-1].strip() if "</think>" in content else content
     except Exception as e:
         return f"❌ Error: {str(e)}"
-with gr.Blocks(title="NuMarkdown API Server") as demo:
     gr.Markdown("# NuMarkdown L40S API Server")
-    gr.Markdown("The Markdown API endpoint is live externally at `/v1/chat/completions`.")
     with gr.Row():
         with gr.Column():
             img_input = gr.Image(type="pil", label="Input Document")
@@ -114,12 +85,43 @@ with gr.Blocks(title="NuMarkdown API Server") as demo:
     btn.click(run_ui_test, inputs=[img_input, txt_input], outputs=[out])
-# --- STEP 4: SAFE MOUNTING ---
-# We mount Gradio to a subpath to prevent it from fighting with the Root path and breaking CSS/JS.
-# Start Gradio's internal state
 demo.queue()
-# This creates the FastAPI instance Gradio uses
 app = demo.app
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=HF_PORT, workers=1)

 import json
 import base64
 from io import BytesIO
+from fastapi import Request
+from fastapi.responses import StreamingResponse, JSONResponse
 import uvicorn
 import gradio as gr
 from openai import OpenAI
 # --- CONFIGURATION ---
 MODEL_ID = "numind/NuMarkdown-8B-Thinking"
+GPU_UTILIZATION = 0.85
 MAX_MODEL_LEN = 16384
 VLLM_PORT = 8000
 HF_PORT = 7860
 def start_vllm():
     if "VLLM_PID" in os.environ:
         return
+    print("🚀 Starting vLLM engine...")
+    command = [
         "python3", "-m", "vllm.entrypoints.openai.api_server",
         "--model", MODEL_ID,
         "--host", "127.0.0.1",
         "--dtype", "bfloat16",
         "--limit-mm-per-prompt", '{"image": 1}'
     ]
+    # Connect vLLM logs to the HF console logs
+    subprocess.Popen(command, stdout=sys.stdout, stderr=sys.stderr)
+    os.environ["VLLM_PID"] = "running"
 start_vllm()
+# --- STEP 2: UI LOGIC ---
 def run_ui_test(image, prompt):
     if image is None: return "⚠️ Please upload an image."
+    # Internal check for vLLM
     try:
         with httpx.Client() as check:
             check.get(f"http://127.0.0.1:{VLLM_PORT}/v1/models", timeout=2.0)
+    except:
+        return "⏳ Model is still loading... please wait 3-5 minutes."
     client = OpenAI(base_url=f"http://127.0.0.1:{VLLM_PORT}/v1", api_key="EMPTY")
     try:
         completion = client.chat.completions.create(
             model=MODEL_ID,
+            messages=[{"role": "user", "content": [
                 {"type": "text", "text": prompt or "Convert to markdown."},
                 {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}}
             ]}],
             timeout=300.0
         )
         content = completion.choices[0].message.content
+        # Suppress reasoning for UI
         return content.split("</think>")[-1].strip() if "</think>" in content else content
     except Exception as e:
         return f"❌ Error: {str(e)}"
+with gr.Blocks(title="NuMarkdown API") as demo:
     gr.Markdown("# NuMarkdown L40S API Server")
+    gr.Markdown("The API is live at `/v1/chat/completions` (Reasoning stripped automatically).")
     with gr.Row():
         with gr.Column():
             img_input = gr.Image(type="pil", label="Input Document")
     btn.click(run_ui_test, inputs=[img_input, txt_input], outputs=[out])
+# --- STEP 3: ATTACH PROXY TO GRADIO'S APP ---
+# We enable the queue for long tasks
 demo.queue()
+# We get the FastAPI instance from Gradio
 app = demo.app
+# We add the external API proxy directly to this app
+@app.api_route("/v1/{path:path}", methods=["GET", "POST", "PUT", "DELETE"])
+async def gatekeeper_proxy(path: str, request: Request):
+    target_url = f"http://127.0.0.1:{VLLM_PORT}/v1/{path}"
+    # Strip Host and Content-Length to prevent routing loops on HF
+    headers = {k: v for k, v in request.headers.items() if k.lower() not in ["host", "content-length"]}
+    async with httpx.AsyncClient(timeout=300.0) as client:
+        try:
+            if path == "chat/completions" and request.method == "POST":
+                body = await request.json()
+                if not body.get("stream", False):
+                    resp = await client.post(target_url, headers=headers, json=body)
+                    if resp.status_code == 200:
+                        data = resp.json()
+                        content = data["choices"][0]["message"].get("content", "")
+                        # STRIP THINKING FROM EXTERNAL DOCLING API
+                        if "</think>" in content:
+                            data["choices"][0]["message"]["content"] = content.split("</think>")[-1].strip()
+                        return JSONResponse(content=data)
+                    return JSONResponse(status_code=resp.status_code, content=resp.json())
+            # Fallback for models list, etc.
+            proxy_req = client.build_request(request.method, target_url, headers=headers, content=await request.body())
+            r = await client.send(proxy_req, stream=True)
+            return StreamingResponse(r.aiter_raw(), status_code=r.status_code, headers=dict(r.headers))
+        except Exception as e:
+            return JSONResponse(status_code=503, content={"error": f"API Proxy Error: {str(e)}"})
+# --- STEP 4: RUN ---
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=HF_PORT, workers=1)

requirements.txt CHANGED Viewed

@@ -3,4 +3,5 @@ gradio
 openai
 fastapi
 uvicorn
-httpx

 openai
 fastapi
 uvicorn
+httpx
+audioop-lts