Require primary evidence for verified references
Browse files
- main.py +36 -0
- src/comparator.py +3 -0
main.py
CHANGED
|
@@ -551,11 +551,14 @@ def validate_entry(entry, workflow, fetchers, comparator):
|
|
| 551 |
if data: result = comparator.compare(entry, data, "scholar")
|
| 552 |
|
| 553 |
if result:
|
|
|
|
|
|
|
| 554 |
results.append(result)
|
| 555 |
|
| 556 |
if results:
|
| 557 |
best = max(results, key=lambda r: r.confidence)
|
| 558 |
_apply_cross_source_conflict_guard(best, results)
|
|
|
|
| 559 |
return best, results
|
| 560 |
|
| 561 |
# No results
|
|
@@ -592,6 +595,39 @@ def _apply_cross_source_conflict_guard(best, results) -> None:
|
|
| 592 |
best.confidence = min(best.confidence, 0.8)
|
| 593 |
|
| 594 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 595 |
|
| 596 |
|
| 597 |
|
|
|
|
| 551 |
if data: result = comparator.compare(entry, data, "scholar")
|
| 552 |
|
| 553 |
if result:
|
| 554 |
+
result.evidence_step = step.name
|
| 555 |
+
result.evidence_url = getattr(data, "url", "") if data else ""
|
| 556 |
results.append(result)
|
| 557 |
|
| 558 |
if results:
|
| 559 |
best = max(results, key=lambda r: r.confidence)
|
| 560 |
_apply_cross_source_conflict_guard(best, results)
|
| 561 |
+
_apply_evidence_guard(best, results)
|
| 562 |
return best, results
|
| 563 |
|
| 564 |
# No results
|
|
|
|
| 595 |
best.confidence = min(best.confidence, 0.8)
|
| 596 |
|
| 597 |
|
| 598 |
+
def _apply_evidence_guard(best, results) -> None:
|
| 599 |
+
"""Require primary evidence or at least two agreeing exact sources."""
|
| 600 |
+
if not best or not getattr(best, "is_match", False):
|
| 601 |
+
return
|
| 602 |
+
|
| 603 |
+
evidence_step = getattr(best, "evidence_step", "")
|
| 604 |
+
if evidence_step in {"arxiv_id", "arxiv_title", "crossref_doi"}:
|
| 605 |
+
return
|
| 606 |
+
|
| 607 |
+
best_year = str(getattr(best, "fetched_year", "") or "").strip()
|
| 608 |
+
agreeing_sources = {getattr(best, "source", "")}
|
| 609 |
+
for result in results:
|
| 610 |
+
if result is best or not getattr(result, "is_match", False):
|
| 611 |
+
continue
|
| 612 |
+
if getattr(result, "title_similarity", 0.0) < 0.95:
|
| 613 |
+
continue
|
| 614 |
+
other_year = str(getattr(result, "fetched_year", "") or "").strip()
|
| 615 |
+
if best_year and other_year == best_year:
|
| 616 |
+
agreeing_sources.add(getattr(result, "source", ""))
|
| 617 |
+
|
| 618 |
+
if len(agreeing_sources) >= 2:
|
| 619 |
+
return
|
| 620 |
+
|
| 621 |
+
issue = (
|
| 622 |
+
"Insufficient evidence: exact match found only in "
|
| 623 |
+
f"{best.source}; needs arXiv/DOI evidence or another agreeing source"
|
| 624 |
+
)
|
| 625 |
+
if issue not in best.issues:
|
| 626 |
+
best.issues.append(issue)
|
| 627 |
+
best.is_match = False
|
| 628 |
+
best.confidence = min(best.confidence, 0.8)
|
| 629 |
+
|
| 630 |
+
|
| 631 |
|
| 632 |
|
| 633 |
|
src/comparator.py
CHANGED
|
@@ -191,6 +191,9 @@ class MetadataComparator:
|
|
| 191 |
# --- Year Comparison ---
|
| 192 |
bib_year = str(bib_entry.year).strip()
|
| 193 |
fetched_year = str(getattr(fetched_data, 'year', '')).strip()
|
|
|
|
|
|
|
|
|
|
| 194 |
year_match = bool(bib_year and fetched_year and bib_year == fetched_year)
|
| 195 |
|
| 196 |
if not bib_year:
|
|
|
|
| 191 |
# --- Year Comparison ---
|
| 192 |
bib_year = str(bib_entry.year).strip()
|
| 193 |
fetched_year = str(getattr(fetched_data, 'year', '')).strip()
|
| 194 |
+
conference_year = str(getattr(fetched_data, 'conference_year', '') or '').strip()
|
| 195 |
+
if source_name.startswith("arxiv") and conference_year and conference_year.isdigit():
|
| 196 |
+
fetched_year = conference_year
|
| 197 |
year_match = bool(bib_year and fetched_year and bib_year == fetched_year)
|
| 198 |
|
| 199 |
if not bib_year:
|