encryptd committed on
Commit
511d3b2
·
1 Parent(s): 29918ac

prog update

Browse files
Files changed (3) hide show
  1. README.md +1 -1
  2. app.py +51 -49
  3. requirements.txt +2 -1
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 💬
4
  colorFrom: yellow
5
  colorTo: purple
6
  sdk: gradio
7
- sdk_version: 4.44.1
8
  app_file: app.py
9
  pinned: false
10
  hf_oauth: true
 
4
  colorFrom: yellow
5
  colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 5.42.0
8
  app_file: app.py
9
  pinned: false
10
  hf_oauth: true
app.py CHANGED
@@ -5,15 +5,15 @@ import httpx
5
  import json
6
  import base64
7
  from io import BytesIO
8
- from fastapi import FastAPI, Request
9
- from fastapi.responses import StreamingResponse, JSONResponse, RedirectResponse
10
  import uvicorn
11
  import gradio as gr
12
  from openai import OpenAI
13
 
14
  # --- CONFIGURATION ---
15
  MODEL_ID = "numind/NuMarkdown-8B-Thinking"
16
- GPU_UTILIZATION = 0.95
17
  MAX_MODEL_LEN = 16384
18
  VLLM_PORT = 8000
19
  HF_PORT = 7860
@@ -22,8 +22,8 @@ HF_PORT = 7860
22
  def start_vllm():
23
  if "VLLM_PID" in os.environ:
24
  return
25
- print("--- STARTING VLLM ---")
26
- command =[
27
  "python3", "-m", "vllm.entrypoints.openai.api_server",
28
  "--model", MODEL_ID,
29
  "--host", "127.0.0.1",
@@ -34,51 +34,22 @@ def start_vllm():
34
  "--dtype", "bfloat16",
35
  "--limit-mm-per-prompt", '{"image": 1}'
36
  ]
37
- proc = subprocess.Popen(command, stdout=sys.stdout, stderr=sys.stderr)
38
- os.environ["VLLM_PID"] = str(proc.pid)
 
39
 
40
  start_vllm()
41
 
42
- # --- STEP 2: FASTAPI GATEKEEPER (EXTERNAL API) ---
43
- app = FastAPI()
44
-
45
- @app.api_route("/v1/{path:path}", methods=["GET", "POST", "PUT", "DELETE"])
46
- async def gatekeeper_proxy(path: str, request: Request):
47
- """Intercepts external Docling API calls and strips reasoning tags."""
48
- target_url = f"http://127.0.0.1:{VLLM_PORT}/v1/{path}"
49
- headers = {k: v for k, v in request.headers.items() if k.lower() not in ["host", "content-length"]}
50
-
51
- async with httpx.AsyncClient(timeout=300.0) as client:
52
- try:
53
- if path == "chat/completions" and request.method == "POST":
54
- body = await request.json()
55
- if not body.get("stream", False):
56
- resp = await client.post(target_url, headers=headers, json=body)
57
- if resp.status_code == 200:
58
- data = resp.json()
59
- content = data["choices"][0]["message"].get("content", "")
60
- # SUPPRESS REASONING TAGS FOR EXTERNAL API
61
- if "</think>" in content:
62
- data["choices"][0]["message"]["content"] = content.split("</think>")[-1].strip()
63
- return JSONResponse(content=data)
64
- return JSONResponse(status_code=resp.status_code, content=resp.json())
65
-
66
- proxy_req = client.build_request(request.method, target_url, headers=headers, content=await request.body())
67
- r = await client.send(proxy_req, stream=True)
68
- return StreamingResponse(r.aiter_raw(), status_code=r.status_code, headers=dict(r.headers))
69
- except Exception as e:
70
- return JSONResponse(status_code=503, content={"error": str(e)})
71
-
72
- # --- STEP 3: GRADIO UI ---
73
  def run_ui_test(image, prompt):
74
- print(">>> UI Request Received")
75
  if image is None: return "⚠️ Please upload an image."
76
 
 
77
  try:
78
  with httpx.Client() as check:
79
  check.get(f"http://127.0.0.1:{VLLM_PORT}/v1/models", timeout=2.0)
80
- except Exception:
81
- return "⏳ Model is still loading (takes 3-5 mins). Please wait."
82
 
83
  client = OpenAI(base_url=f"http://127.0.0.1:{VLLM_PORT}/v1", api_key="EMPTY")
84
  try:
@@ -89,21 +60,21 @@ def run_ui_test(image, prompt):
89
 
90
  completion = client.chat.completions.create(
91
  model=MODEL_ID,
92
- messages=[{"role": "user", "content":[
93
  {"type": "text", "text": prompt or "Convert to markdown."},
94
  {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}}
95
  ]}],
96
- max_tokens=4096,
97
  timeout=300.0
98
  )
99
  content = completion.choices[0].message.content
 
100
  return content.split("</think>")[-1].strip() if "</think>" in content else content
101
  except Exception as e:
102
  return f"❌ Error: {str(e)}"
103
 
104
- with gr.Blocks(title="NuMarkdown API Server") as demo:
105
  gr.Markdown("# NuMarkdown L40S API Server")
106
- gr.Markdown("The Markdown API endpoint is live externally at `/v1/chat/completions`.")
107
  with gr.Row():
108
  with gr.Column():
109
  img_input = gr.Image(type="pil", label="Input Document")
@@ -114,12 +85,43 @@ with gr.Blocks(title="NuMarkdown API Server") as demo:
114
 
115
  btn.click(run_ui_test, inputs=[img_input, txt_input], outputs=[out])
116
 
117
- # --- STEP 4: SAFE MOUNTING ---
118
- # We mount Gradio to a subpath to prevent it from fighting with the Root path and breaking CSS/JS.
119
- # Start Gradio's internal state
120
  demo.queue()
121
- # This creates the FastAPI instance Gradio uses
 
122
  app = demo.app
123
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  if __name__ == "__main__":
125
  uvicorn.run(app, host="0.0.0.0", port=HF_PORT, workers=1)
 
5
  import json
6
  import base64
7
  from io import BytesIO
8
+ from fastapi import Request
9
+ from fastapi.responses import StreamingResponse, JSONResponse
10
  import uvicorn
11
  import gradio as gr
12
  from openai import OpenAI
13
 
14
  # --- CONFIGURATION ---
15
  MODEL_ID = "numind/NuMarkdown-8B-Thinking"
16
+ GPU_UTILIZATION = 0.85
17
  MAX_MODEL_LEN = 16384
18
  VLLM_PORT = 8000
19
  HF_PORT = 7860
 
22
  def start_vllm():
23
  if "VLLM_PID" in os.environ:
24
  return
25
+ print("🚀 Starting vLLM engine...")
26
+ command = [
27
  "python3", "-m", "vllm.entrypoints.openai.api_server",
28
  "--model", MODEL_ID,
29
  "--host", "127.0.0.1",
 
34
  "--dtype", "bfloat16",
35
  "--limit-mm-per-prompt", '{"image": 1}'
36
  ]
37
+ # Connect vLLM logs to the HF console logs
38
+ subprocess.Popen(command, stdout=sys.stdout, stderr=sys.stderr)
39
+ os.environ["VLLM_PID"] = "running"
40
 
41
  start_vllm()
42
 
43
+ # --- STEP 2: UI LOGIC ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  def run_ui_test(image, prompt):
 
45
  if image is None: return "⚠️ Please upload an image."
46
 
47
+ # Internal check for vLLM
48
  try:
49
  with httpx.Client() as check:
50
  check.get(f"http://127.0.0.1:{VLLM_PORT}/v1/models", timeout=2.0)
51
+ except:
52
+ return "⏳ Model is still loading... please wait 3-5 minutes."
53
 
54
  client = OpenAI(base_url=f"http://127.0.0.1:{VLLM_PORT}/v1", api_key="EMPTY")
55
  try:
 
60
 
61
  completion = client.chat.completions.create(
62
  model=MODEL_ID,
63
+ messages=[{"role": "user", "content": [
64
  {"type": "text", "text": prompt or "Convert to markdown."},
65
  {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}}
66
  ]}],
 
67
  timeout=300.0
68
  )
69
  content = completion.choices[0].message.content
70
+ # Suppress reasoning for UI
71
  return content.split("</think>")[-1].strip() if "</think>" in content else content
72
  except Exception as e:
73
  return f"❌ Error: {str(e)}"
74
 
75
+ with gr.Blocks(title="NuMarkdown API") as demo:
76
  gr.Markdown("# NuMarkdown L40S API Server")
77
+ gr.Markdown("The API is live at `/v1/chat/completions` (Reasoning stripped automatically).")
78
  with gr.Row():
79
  with gr.Column():
80
  img_input = gr.Image(type="pil", label="Input Document")
 
85
 
86
  btn.click(run_ui_test, inputs=[img_input, txt_input], outputs=[out])
87
 
88
+ # --- STEP 3: ATTACH PROXY TO GRADIO'S APP ---
89
+ # We enable the queue for long tasks
 
90
  demo.queue()
91
+
92
+ # We get the FastAPI instance from Gradio
93
  app = demo.app
94
 
95
+ # We add the external API proxy directly to this app
96
def _strip_reasoning(data):
    """Remove everything up to the last ``</think>`` tag from the first choice.

    Mutates ``data`` in place. Payloads that do not have the expected
    ``choices[0].message.content`` string are left untouched instead of
    raising, so an unusual upstream reply is still relayed to the caller.
    """
    if not isinstance(data, dict):
        return
    choices = data.get("choices")
    if not isinstance(choices, list) or not choices:
        return
    first = choices[0]
    message = first.get("message") if isinstance(first, dict) else None
    if not isinstance(message, dict):
        return
    content = message.get("content")
    # ``content`` can be present but null; only a real string can hold the tag.
    if isinstance(content, str) and "</think>" in content:
        message["content"] = content.split("</think>")[-1].strip()


@app.api_route("/v1/{path:path}", methods=["GET", "POST", "PUT", "DELETE"])
async def gatekeeper_proxy(path: str, request: Request):
    """Forward an external ``/v1/*`` request to the local vLLM server.

    Non-streaming ``POST chat/completions`` calls are sent as JSON and the
    reply has its ``</think>`` reasoning prefix stripped before it is returned.
    Every other request (including ``stream=True`` chat completions) is relayed
    byte-for-byte as a streaming response; reasoning is NOT stripped there.

    Args:
        path: The part of the URL after ``/v1/``.
        request: The incoming request whose method, headers and body are relayed.

    Returns:
        A ``JSONResponse`` or ``StreamingResponse`` mirroring the upstream
        status. A malformed JSON body on ``chat/completions`` yields 400, and
        any failure while talking to the upstream yields 503.
    """
    target_url = f"http://127.0.0.1:{VLLM_PORT}/v1/{path}"

    # Strip Host and Content-Length to prevent routing loops on HF
    headers = {
        k: v
        for k, v in request.headers.items()
        if k.lower() not in ("host", "content-length")
    }

    # The client is closed explicitly instead of via ``async with``: in the
    # streaming branch the body is read *after* this handler returns, so the
    # client has to stay open until the relay generator has finished.
    client = httpx.AsyncClient(timeout=300.0)
    handed_off = False
    try:
        if path == "chat/completions" and request.method == "POST":
            try:
                body = await request.json()
            except ValueError:
                # Covers JSONDecodeError and undecodable bytes: a client error,
                # not an upstream outage.
                return JSONResponse(
                    status_code=400,
                    content={"error": "Request body is not valid JSON."},
                )
            # A non-object body falls through to the raw relay below so the
            # upstream server can produce its own validation error.
            if isinstance(body, dict) and not body.get("stream", False):
                resp = await client.post(target_url, headers=headers, json=body)
                try:
                    data = resp.json()
                except ValueError:
                    # Keep the real upstream status even when its body is not JSON.
                    data = {"error": resp.text}
                if resp.status_code == 200:
                    # STRIP THINKING FROM EXTERNAL DOCLING API
                    _strip_reasoning(data)
                return JSONResponse(status_code=resp.status_code, content=data)

        # Fallback for models list, streaming completions, etc.
        proxy_req = client.build_request(
            request.method,
            target_url,
            headers=headers,
            content=await request.body(),
        )
        upstream = await client.send(proxy_req, stream=True)

        async def relay():
            """Yield the raw upstream body, then release the response and client."""
            try:
                async for chunk in upstream.aiter_raw():
                    yield chunk
            finally:
                try:
                    await upstream.aclose()
                finally:
                    await client.aclose()

        response = StreamingResponse(
            relay(),
            status_code=upstream.status_code,
            headers=dict(upstream.headers),
        )
        handed_off = True  # from here on, relay() owns closing the client
        return response
    except Exception as e:
        return JSONResponse(
            status_code=503,
            content={"error": f"API Proxy Error: {str(e)}"},
        )
    finally:
        if not handed_off:
            await client.aclose()
124
+
125
+ # --- STEP 4: RUN ---
126
  if __name__ == "__main__":
127
  uvicorn.run(app, host="0.0.0.0", port=HF_PORT, workers=1)
requirements.txt CHANGED
@@ -3,4 +3,5 @@ gradio
3
  openai
4
  fastapi
5
  uvicorn
6
- httpx
 
 
3
  openai
4
  fastapi
5
  uvicorn
6
+ httpx
7
+ audioop-lts