Spaces:

Deign86
/

mathpulse-api-v3test

Running

Deign86 committed 12 days ago

Commit

a8945eb

1 Parent(s): 1af7678

fix: enable DeepSeek direct streaming for chat/stream endpoint

Files changed (2) hide show

main.py CHANGED Viewed

@@ -1418,14 +1418,17 @@ async def call_hf_chat_stream_async(
                 yield str(chunk)
     client = get_inference_client()
-    async for chunk in client._call_deepseek_stream(
-        messages,
-        max_tokens=max_tokens,
-        temperature=temperature,
-        top_p=top_p,
         model=model,
         task_type=task_type,
-    ):
         yield str(chunk)

                 yield str(chunk)
     client = get_inference_client()
+    req = InferenceRequest(
+        messages=messages,
         model=model,
         task_type=task_type,
+        request_tag=f"{task_type}-async-{int(time.time() * 1000)}",
+        max_new_tokens=max_tokens,
+        temperature=temperature,
+        top_p=top_p,
+        timeout_sec=timeout,
+    )
+    async for chunk in client._call_deepseek_stream(req):
         yield str(chunk)

services/inference_client.py CHANGED Viewed

@@ -913,9 +913,7 @@ class InferenceClient:
             "model": target_model,
             "messages": req.messages,
             "max_tokens": req.max_new_tokens or self.default_max_new_tokens,
-            "stream": True,
         }
         if target_model == REASONER_MODEL:
             params["max_tokens"] = req.max_new_tokens or 1024
         else:
@@ -932,10 +930,10 @@ class InferenceClient:
             )
             start = time.perf_counter()
             try:
-                async with client.chat.completions.stream(**params, timeout=timeout) as stream:
-                    async for event in stream:
-                        if event.type == "content.delta" and event.content:
-                            yield event.content
                 latency_ms = (time.perf_counter() - start) * 1000
                 log_model_call(

             "model": target_model,
             "messages": req.messages,
             "max_tokens": req.max_new_tokens or self.default_max_new_tokens,
         }
         if target_model == REASONER_MODEL:
             params["max_tokens"] = req.max_new_tokens or 1024
         else:
             )
             start = time.perf_counter()
             try:
+                stream = client.chat.completions.stream(**params, timeout=timeout)
+                async for chunk in stream:
+                    if chunk.choices[0].delta.content:
+                        yield chunk.choices[0].delta.content
                 latency_ms = (time.perf_counter() - start) * 1000
                 log_model_call(