ricklon committed on
Commit
7e8815a
·
1 Parent(s): 2c2efb5

Dedupe repeated equation blocks in region OCR output

Browse files
Files changed (2) hide show
  1. app.py +58 -0
  2. tests/test_clean_output.py +12 -0
app.py CHANGED
@@ -85,8 +85,19 @@ TASK_PROMPTS = {
85
 
86
  def extract_grounding_references(text):
87
  refs = []
 
88
  for entry in _extract_grounding_entries(text):
89
  coord_text = repr(entry["coords"])
 
 
 
 
 
 
 
 
 
 
90
  raw = f'<|ref|>{entry["label"]}<|/ref|><|det|>{coord_text}<|/det|>'
91
  refs.append((raw, entry["label"], coord_text))
92
  return refs
@@ -249,6 +260,7 @@ def clean_output(text, include_images=False):
249
  text = re.sub(rf'(?m)^[^\n]*{re.escape(match[0])}[^\n]*\n?', '', text)
250
 
251
  text = _strip_malformed_grounding(text)
 
252
  return text.strip()
253
 
254
  def _strip_malformed_grounding(text: str) -> str:
@@ -268,6 +280,46 @@ def _strip_malformed_grounding(text: str) -> str:
268
  text = re.sub(r'<\|/?det\|>', '', text)
269
  return text
270
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
  PREVIEW_CSS = """
272
  <style>
273
  .math-preview {
@@ -869,6 +921,7 @@ def _process_equation_lines_separately(image, infer_crop_mode=None):
869
  raw_parts = [f"## Detection\n\n{detect_raw}".strip()]
870
  refs = []
871
  crops = []
 
872
 
873
  for i, box in enumerate(boxes, 1):
874
  x1, y1, x2, y2 = _norm_box_to_pixels(box, img_w, img_h, pad_ratio=0.01)
@@ -877,6 +930,11 @@ def _process_equation_lines_separately(image, infer_crop_mode=None):
877
  line_clean = clean_output(line_raw, False).strip()
878
  if not line_clean:
879
  continue
 
 
 
 
 
880
  line_label = f"Eq {i}"
881
  line_markdown = line_clean
882
  if "$$" not in line_markdown and "\\[" not in line_markdown and "\\(" not in line_markdown:
 
85
 
86
def extract_grounding_references(text):
    """Collect unique grounding references found in *text*.

    Each entry yielded by ``_extract_grounding_entries`` is indexed by its
    ``"label"`` and ``"coords"`` keys. Two entries count as the same
    reference when their labels match after stripping and lowercasing and
    the first four values of every coordinate item agree once rounded to
    one decimal place; only the first such entry is kept.

    Returns:
        A list of ``(raw_tag, label, coord_text)`` tuples in first-seen
        order, where ``raw_tag`` is the rebuilt ``<|ref|>…<|det|>…`` markup
        and ``coord_text`` is ``repr`` of the entry's coordinates.
    """
    unique_refs = []
    already_seen = set()
    for item in _extract_grounding_entries(text):
        label = item["label"]
        coord_text = repr(item["coords"])
        # Round to one decimal so near-identical boxes collapse into a
        # single reference.
        rounded_boxes = tuple(
            tuple(round(box[idx], 1) for idx in range(4))
            for box in item["coords"]
        )
        fingerprint = (label.strip().lower(), rounded_boxes)
        if fingerprint not in already_seen:
            already_seen.add(fingerprint)
            raw = f'<|ref|>{label}<|/ref|><|det|>{coord_text}<|/det|>'
            unique_refs.append((raw, label, coord_text))
    return unique_refs
 
260
  text = re.sub(rf'(?m)^[^\n]*{re.escape(match[0])}[^\n]*\n?', '', text)
261
 
262
  text = _strip_malformed_grounding(text)
263
+ text = _dedupe_repeated_math_blocks(text)
264
  return text.strip()
265
 
266
  def _strip_malformed_grounding(text: str) -> str:
 
280
  text = re.sub(r'<\|/?det\|>', '', text)
281
  return text
282
 
283
+ def _equation_text_key(text: str) -> str:
284
+ if not text:
285
+ return ""
286
+ key = text.strip()
287
+ key = re.sub(r'\\\[(.+?)\\\]', r'\1', key, flags=re.DOTALL)
288
+ key = re.sub(r'\\\((.+?)\\\)', r'\1', key, flags=re.DOTALL)
289
+ key = re.sub(r'\$\$(.+?)\$\$', r'\1', key, flags=re.DOTALL)
290
+ key = re.sub(r'\^\{([A-Za-z0-9])\}', r'^\1', key)
291
+ key = re.sub(r'_\{([A-Za-z0-9])\}', r'_\1', key)
292
+ key = re.sub(r'\s+', '', key)
293
+ return key.lower()
294
+
295
+ def _dedupe_repeated_math_blocks(text: str) -> str:
296
+ if not text:
297
+ return ""
298
+
299
+ pattern = re.compile(r'\\\[(.+?)\\\]|\\\((.+?)\\\)|\$\$(.+?)\$\$', re.DOTALL)
300
+ seen = set()
301
+ out = []
302
+ last = 0
303
+ removed_any = False
304
+
305
+ for m in pattern.finditer(text):
306
+ out.append(text[last:m.start()])
307
+ expr = m.group(1) or m.group(2) or m.group(3) or ""
308
+ key = _equation_text_key(expr)
309
+ if key and key in seen:
310
+ removed_any = True
311
+ else:
312
+ if key:
313
+ seen.add(key)
314
+ out.append(m.group(0))
315
+ last = m.end()
316
+ out.append(text[last:])
317
+
318
+ merged = ''.join(out)
319
+ if removed_any:
320
+ merged = re.sub(r'\n{3,}', '\n\n', merged)
321
+ return merged
322
+
323
  PREVIEW_CSS = """
324
  <style>
325
  .math-preview {
 
921
  raw_parts = [f"## Detection\n\n{detect_raw}".strip()]
922
  refs = []
923
  crops = []
924
+ seen_line_keys = set()
925
 
926
  for i, box in enumerate(boxes, 1):
927
  x1, y1, x2, y2 = _norm_box_to_pixels(box, img_w, img_h, pad_ratio=0.01)
 
930
  line_clean = clean_output(line_raw, False).strip()
931
  if not line_clean:
932
  continue
933
+ line_key = _equation_text_key(line_clean)
934
+ if line_key and line_key in seen_line_keys:
935
+ continue
936
+ if line_key:
937
+ seen_line_keys.add(line_key)
938
  line_label = f"Eq {i}"
939
  line_markdown = line_clean
940
  if "$$" not in line_markdown and "\\[" not in line_markdown and "\\(" not in line_markdown:
tests/test_clean_output.py CHANGED
@@ -10,6 +10,8 @@ def _load_clean_output():
10
  module = ast.parse(source, filename=str(app_path))
11
 
12
  wanted = {
 
 
13
  "_strip_malformed_grounding",
14
  "clean_output",
15
  }
@@ -49,6 +51,16 @@ class CleanOutputTests(unittest.TestCase):
49
  self.assertNotIn("<|ref|>", cleaned)
50
  self.assertNotIn("<|det|>", cleaned)
51
 
 
 
 
 
 
 
 
 
 
 
52
 
53
  if __name__ == "__main__":
54
  unittest.main()
 
10
  module = ast.parse(source, filename=str(app_path))
11
 
12
  wanted = {
13
+ "_equation_text_key",
14
+ "_dedupe_repeated_math_blocks",
15
  "_strip_malformed_grounding",
16
  "clean_output",
17
  }
 
51
  self.assertNotIn("<|ref|>", cleaned)
52
  self.assertNotIn("<|det|>", cleaned)
53
 
54
def test_dedupes_equivalent_math_blocks(self):
    """Two spellings of one equation (``^2`` vs ``^{2}``) leave a single block."""
    bare_exponent = "\\[ \\frac{(18x-27-7)}{(2x-3)^2}=\\frac{18x-34}{(2x-3)^2} \\]\n"
    braced_exponent = "\\[ \\frac{(18x-27-7)}{(2x-3)^{2}}=\\frac{18x-34}{(2x-3)^{2}} \\]\n"

    cleaner = _load_clean_output()
    result = cleaner(bare_exponent + braced_exponent, include_images=True)

    # Only the first of the two equivalent display blocks should survive.
    self.assertEqual(1, result.count("\\["))
63
+
64
 
65
  if __name__ == "__main__":
66
  unittest.main()