Update parser.py: handle Qwen3 thinking tokens, /no_think support
Browse files — floorplan/parser.py (+19 −6)
floorplan/parser.py
CHANGED
|
@@ -42,7 +42,9 @@ def _image_to_base64(image: Image.Image | str | Path) -> str:
|
|
| 42 |
|
| 43 |
|
| 44 |
def _extract_json_from_response(text: str) -> dict:
|
| 45 |
-
"""Extract JSON from a VLM response that may contain markdown code blocks."""
|
|
|
|
|
|
|
| 46 |
text = text.strip()
|
| 47 |
try:
|
| 48 |
return json.loads(text)
|
|
@@ -182,12 +184,18 @@ class FloorPlanParser:
|
|
| 182 |
def parse_image(self, image, detail="high", temperature=0.1):
|
| 183 |
"""Parse a floor plan image into the structured schema."""
|
| 184 |
image_b64 = _image_to_base64(image)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
response = self.client.chat.completions.create(
|
| 186 |
-
model=self.model, temperature=temperature, max_tokens=
|
| 187 |
messages=[
|
| 188 |
-
{"role": "system", "content":
|
| 189 |
{"role": "user", "content": [
|
| 190 |
-
{"type": "text", "text": "Parse this floor plan image. Extract ALL walls with their centerlines, thicknesses, and any doors/windows on them. Use pixel coordinates. Be thorough — every wall segment matters for room detection."},
|
| 191 |
{"type": "image_url", "image_url": {"url": image_b64, "detail": detail}},
|
| 192 |
]},
|
| 193 |
],
|
|
@@ -201,10 +209,15 @@ class FloorPlanParser:
|
|
| 201 |
orig_b64 = _image_to_base64(original_image)
|
| 202 |
overlay_b64 = _image_to_base64(overlay_image)
|
| 203 |
schema_json = current_schema.model_dump_json(indent=2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
response = self.client.chat.completions.create(
|
| 205 |
-
model=self.model, temperature=temperature, max_tokens=
|
| 206 |
messages=[
|
| 207 |
-
{"role": "system", "content":
|
| 208 |
{"role": "user", "content": [
|
| 209 |
{"type": "text", "text": f"Iteration {iteration}. Compare these two images and identify corrections needed.\n\nCurrent schema ({len(current_schema.walls)} walls, {sum(len(w.openings) for w in current_schema.walls)} openings):\n```json\n{schema_json}\n```\n\nImage 1 is the ORIGINAL floor plan. Image 2 is the OVERLAY (parsed schema rendered on the original)."},
|
| 210 |
{"type": "image_url", "image_url": {"url": orig_b64, "detail": "high"}},
|
|
|
|
| 42 |
|
| 43 |
|
| 44 |
def _extract_json_from_response(text: str) -> dict:
|
| 45 |
+
"""Extract JSON from a VLM response that may contain markdown code blocks or thinking tokens."""
|
| 46 |
+
# Strip Qwen3-style thinking blocks
|
| 47 |
+
text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
|
| 48 |
text = text.strip()
|
| 49 |
try:
|
| 50 |
return json.loads(text)
|
|
|
|
| 184 |
def parse_image(self, image, detail="high", temperature=0.1):
|
| 185 |
"""Parse a floor plan image into the structured schema."""
|
| 186 |
image_b64 = _image_to_base64(image)
|
| 187 |
+
|
| 188 |
+
# For Qwen3 models, prepend /no_think to disable extended reasoning
|
| 189 |
+
system_content = PARSE_SYSTEM_PROMPT
|
| 190 |
+
if "Qwen3" in self.model:
|
| 191 |
+
system_content = "/no_think\n" + system_content
|
| 192 |
+
|
| 193 |
response = self.client.chat.completions.create(
|
| 194 |
+
model=self.model, temperature=temperature, max_tokens=8192,
|
| 195 |
messages=[
|
| 196 |
+
{"role": "system", "content": system_content},
|
| 197 |
{"role": "user", "content": [
|
| 198 |
+
{"type": "text", "text": "Parse this floor plan image. Extract ALL walls with their centerlines, thicknesses, and any doors/windows on them. Use pixel coordinates. Be thorough — every wall segment matters for room detection. Output ONLY the JSON, no explanation."},
|
| 199 |
{"type": "image_url", "image_url": {"url": image_b64, "detail": detail}},
|
| 200 |
]},
|
| 201 |
],
|
|
|
|
| 209 |
orig_b64 = _image_to_base64(original_image)
|
| 210 |
overlay_b64 = _image_to_base64(overlay_image)
|
| 211 |
schema_json = current_schema.model_dump_json(indent=2)
|
| 212 |
+
|
| 213 |
+
system_content = CORRECTION_SYSTEM_PROMPT
|
| 214 |
+
if "Qwen3" in self.model:
|
| 215 |
+
system_content = "/no_think\n" + system_content
|
| 216 |
+
|
| 217 |
response = self.client.chat.completions.create(
|
| 218 |
+
model=self.model, temperature=temperature, max_tokens=4096,
|
| 219 |
messages=[
|
| 220 |
+
{"role": "system", "content": system_content},
|
| 221 |
{"role": "user", "content": [
|
| 222 |
{"type": "text", "text": f"Iteration {iteration}. Compare these two images and identify corrections needed.\n\nCurrent schema ({len(current_schema.walls)} walls, {sum(len(w.openings) for w in current_schema.walls)} openings):\n```json\n{schema_json}\n```\n\nImage 1 is the ORIGINAL floor plan. Image 2 is the OVERLAY (parsed schema rendered on the original)."},
|
| 223 |
{"type": "image_url", "image_url": {"url": orig_b64, "detail": "high"}},
|