Spaces:
Running on Zero
Running on Zero
Dedupe repeated equation blocks in region OCR output
Browse files
- app.py +58 -0
- tests/test_clean_output.py +12 -0
app.py
CHANGED
|
@@ -85,8 +85,19 @@ TASK_PROMPTS = {
|
|
| 85 |
|
| 86 |
def extract_grounding_references(text):
    """Return a ``(raw, label, coord_text)`` tuple for every grounding entry in *text*.

    Entries come from ``_extract_grounding_entries(text)``. ``coord_text`` is
    the ``repr`` of the entry's ``"coords"`` value and ``raw`` is the
    ``<|ref|>…<|/ref|><|det|>…<|/det|>`` tag rebuilt from the label and that
    ``repr``.
    """
    def _as_reference(entry):
        # Coordinates are read before the label, matching the lookup order
        # callers may already depend on for error reporting.
        coord_text = repr(entry["coords"])
        label = entry["label"]
        raw = f'<|ref|>{label}<|/ref|><|det|>{coord_text}<|/det|>'
        return raw, label, coord_text

    return [_as_reference(entry) for entry in _extract_grounding_entries(text)]
|
|
@@ -249,6 +260,7 @@ def clean_output(text, include_images=False):
|
|
| 249 |
text = re.sub(rf'(?m)^[^\n]*{re.escape(match[0])}[^\n]*\n?', '', text)
|
| 250 |
|
| 251 |
text = _strip_malformed_grounding(text)
|
|
|
|
| 252 |
return text.strip()
|
| 253 |
|
| 254 |
def _strip_malformed_grounding(text: str) -> str:
|
|
@@ -268,6 +280,46 @@ def _strip_malformed_grounding(text: str) -> str:
|
|
| 268 |
text = re.sub(r'<\|/?det\|>', '', text)
|
| 269 |
return text
|
| 270 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
PREVIEW_CSS = """
|
| 272 |
<style>
|
| 273 |
.math-preview {
|
|
@@ -869,6 +921,7 @@ def _process_equation_lines_separately(image, infer_crop_mode=None):
|
|
| 869 |
raw_parts = [f"## Detection\n\n{detect_raw}".strip()]
|
| 870 |
refs = []
|
| 871 |
crops = []
|
|
|
|
| 872 |
|
| 873 |
for i, box in enumerate(boxes, 1):
|
| 874 |
x1, y1, x2, y2 = _norm_box_to_pixels(box, img_w, img_h, pad_ratio=0.01)
|
|
@@ -877,6 +930,11 @@ def _process_equation_lines_separately(image, infer_crop_mode=None):
|
|
| 877 |
line_clean = clean_output(line_raw, False).strip()
|
| 878 |
if not line_clean:
|
| 879 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 880 |
line_label = f"Eq {i}"
|
| 881 |
line_markdown = line_clean
|
| 882 |
if "$$" not in line_markdown and "\\[" not in line_markdown and "\\(" not in line_markdown:
|
|
|
|
| 85 |
|
| 86 |
def extract_grounding_references(text):
    """Return de-duplicated ``(raw, label, coord_text)`` tuples for *text*.

    Entries come from ``_extract_grounding_entries(text)``. Two entries count
    as the same reference when their labels match after stripping and
    lower-casing, and the first four values of every item in ``"coords"``
    agree after rounding to one decimal place. Only the first occurrence is
    kept, in original order.

    ``coord_text`` is the ``repr`` of the entry's ``"coords"`` value and
    ``raw`` is the ``<|ref|>…<|/ref|><|det|>…<|/det|>`` tag rebuilt from the
    label and that ``repr``.
    """
    unique_refs = []
    emitted = set()
    for entry in _extract_grounding_entries(text):
        coord_text = repr(entry["coords"])

        # Fingerprint: normalised label + coarsely rounded coordinates.
        # NOTE(review): assumes each item of entry["coords"] is an indexable
        # sequence of at least four numbers — confirm against
        # _extract_grounding_entries.
        label_key = entry["label"].strip().lower()
        rounded_boxes = tuple(
            tuple(round(box[i], 1) for i in range(4))
            for box in entry["coords"]
        )
        fingerprint = (label_key, rounded_boxes)

        if fingerprint not in emitted:
            emitted.add(fingerprint)
            raw = f'<|ref|>{entry["label"]}<|/ref|><|det|>{coord_text}<|/det|>'
            unique_refs.append((raw, entry["label"], coord_text))
    return unique_refs
|
|
|
|
| 260 |
text = re.sub(rf'(?m)^[^\n]*{re.escape(match[0])}[^\n]*\n?', '', text)
|
| 261 |
|
| 262 |
text = _strip_malformed_grounding(text)
|
| 263 |
+
text = _dedupe_repeated_math_blocks(text)
|
| 264 |
return text.strip()
|
| 265 |
|
| 266 |
def _strip_malformed_grounding(text: str) -> str:
|
|
|
|
| 280 |
text = re.sub(r'<\|/?det\|>', '', text)
|
| 281 |
return text
|
| 282 |
|
| 283 |
+
def _equation_text_key(text: str) -> str:
|
| 284 |
+
if not text:
|
| 285 |
+
return ""
|
| 286 |
+
key = text.strip()
|
| 287 |
+
key = re.sub(r'\\\[(.+?)\\\]', r'\1', key, flags=re.DOTALL)
|
| 288 |
+
key = re.sub(r'\\\((.+?)\\\)', r'\1', key, flags=re.DOTALL)
|
| 289 |
+
key = re.sub(r'\$\$(.+?)\$\$', r'\1', key, flags=re.DOTALL)
|
| 290 |
+
key = re.sub(r'\^\{([A-Za-z0-9])\}', r'^\1', key)
|
| 291 |
+
key = re.sub(r'_\{([A-Za-z0-9])\}', r'_\1', key)
|
| 292 |
+
key = re.sub(r'\s+', '', key)
|
| 293 |
+
return key.lower()
|
| 294 |
+
|
| 295 |
+
def _dedupe_repeated_math_blocks(text: str) -> str:
    """Remove repeated equation blocks from *text*, keeping the first occurrence.

    Math spans delimited by ``\\[...\\]``, ``\\(...\\)`` or ``$$...$$`` are
    compared via ``_equation_text_key``; a span whose key was already seen is
    dropped. Display-delimited spans (``\\[...\\]``, ``$$...$$``) always take
    part. An inline ``\\(...\\)`` span takes part only when it is the sole
    content of its line; inline math embedded in running prose is left alone,
    because deleting a repeated symbol mid-sentence ("Let \\(x\\) ... then
    \\(x\\) ...") would corrupt the surrounding text.

    When at least one span is removed, runs of three or more newlines in the
    result are collapsed to a blank line.

    Args:
        text: Text that may contain repeated math blocks.

    Returns:
        The text with repeated blocks removed, or ``""`` when *text* is empty
        or ``None``.
    """
    if not text:
        return ""

    pattern = re.compile(r'\\\[(.+?)\\\]|\\\((.+?)\\\)|\$\$(.+?)\$\$', re.DOTALL)
    seen = set()
    out = []
    last = 0
    removed_any = False

    for m in pattern.finditer(text):
        out.append(text[last:m.start()])
        last = m.end()

        if m.group(2) is not None:
            # Inline ``\(...\)``: treat it as an equation block only when
            # nothing but whitespace shares its line.
            line_start = text.rfind('\n', 0, m.start()) + 1
            line_end = text.find('\n', m.end())
            if line_end == -1:
                line_end = len(text)
            has_neighbours = (
                text[line_start:m.start()].strip()
                or text[m.end():line_end].strip()
            )
            if has_neighbours:
                out.append(m.group(0))
                continue

        expr = m.group(1) or m.group(2) or m.group(3) or ""
        key = _equation_text_key(expr)
        if key and key in seen:
            # Repeated block: drop it.
            removed_any = True
            continue
        if key:
            seen.add(key)
        out.append(m.group(0))
    out.append(text[last:])

    merged = ''.join(out)
    if removed_any:
        # Removing a block can leave stacked blank lines behind.
        merged = re.sub(r'\n{3,}', '\n\n', merged)
    return merged
|
| 322 |
+
|
| 323 |
PREVIEW_CSS = """
|
| 324 |
<style>
|
| 325 |
.math-preview {
|
|
|
|
| 921 |
raw_parts = [f"## Detection\n\n{detect_raw}".strip()]
|
| 922 |
refs = []
|
| 923 |
crops = []
|
| 924 |
+
seen_line_keys = set()
|
| 925 |
|
| 926 |
for i, box in enumerate(boxes, 1):
|
| 927 |
x1, y1, x2, y2 = _norm_box_to_pixels(box, img_w, img_h, pad_ratio=0.01)
|
|
|
|
| 930 |
line_clean = clean_output(line_raw, False).strip()
|
| 931 |
if not line_clean:
|
| 932 |
continue
|
| 933 |
+
line_key = _equation_text_key(line_clean)
|
| 934 |
+
if line_key and line_key in seen_line_keys:
|
| 935 |
+
continue
|
| 936 |
+
if line_key:
|
| 937 |
+
seen_line_keys.add(line_key)
|
| 938 |
line_label = f"Eq {i}"
|
| 939 |
line_markdown = line_clean
|
| 940 |
if "$$" not in line_markdown and "\\[" not in line_markdown and "\\(" not in line_markdown:
|
tests/test_clean_output.py
CHANGED
|
@@ -10,6 +10,8 @@ def _load_clean_output():
|
|
| 10 |
module = ast.parse(source, filename=str(app_path))
|
| 11 |
|
| 12 |
wanted = {
|
|
|
|
|
|
|
| 13 |
"_strip_malformed_grounding",
|
| 14 |
"clean_output",
|
| 15 |
}
|
|
@@ -49,6 +51,16 @@ class CleanOutputTests(unittest.TestCase):
|
|
| 49 |
self.assertNotIn("<|ref|>", cleaned)
|
| 50 |
self.assertNotIn("<|det|>", cleaned)
|
| 51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
if __name__ == "__main__":
|
| 54 |
unittest.main()
|
|
|
|
| 10 |
module = ast.parse(source, filename=str(app_path))
|
| 11 |
|
| 12 |
wanted = {
|
| 13 |
+
"_equation_text_key",
|
| 14 |
+
"_dedupe_repeated_math_blocks",
|
| 15 |
"_strip_malformed_grounding",
|
| 16 |
"clean_output",
|
| 17 |
}
|
|
|
|
| 51 |
self.assertNotIn("<|ref|>", cleaned)
|
| 52 |
self.assertNotIn("<|det|>", cleaned)
|
| 53 |
|
| 54 |
+
def test_dedupes_equivalent_math_blocks(self):
    """Two display equations differing only in ``^2`` vs ``^{2}`` leave one ``\\[``."""
    clean_output = _load_clean_output()
    equation_lines = [
        "\\[ \\frac{(18x-27-7)}{(2x-3)^2}=\\frac{18x-34}{(2x-3)^2} \\]\n",
        "\\[ \\frac{(18x-27-7)}{(2x-3)^{2}}=\\frac{18x-34}{(2x-3)^{2}} \\]\n",
    ]

    cleaned = clean_output("".join(equation_lines), include_images=True)

    display_open_count = cleaned.count("\\[")
    self.assertEqual(1, display_open_count)
|
| 63 |
+
|
| 64 |
|
| 65 |
if __name__ == "__main__":
|
| 66 |
unittest.main()
|