File size: 25,047 Bytes
11a28db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
838da49
11a28db
 
 
 
 
838da49
11a28db
 
838da49
11a28db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
838da49
11a28db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
838da49
11a28db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
838da49
 
 
 
11a28db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
838da49
 
 
11a28db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
838da49
 
 
 
 
 
 
 
 
 
 
 
 
 
11a28db
 
 
 
 
 
 
 
838da49
 
 
11a28db
 
838da49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11a28db
 
 
 
 
 
 
 
 
 
838da49
 
 
11a28db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
838da49
11a28db
 
 
 
 
 
838da49
 
 
 
 
 
 
 
 
 
 
11a28db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a3caaa3
 
11a28db
 
 
 
838da49
a3caaa3
11a28db
 
 
 
 
 
838da49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a3caaa3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11a28db
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
#!/usr/bin/env python3
"""
BibGuard - Citation Hallucination Detector

Validates bibliography entries against multiple academic data sources:
arXiv, CrossRef, DBLP, Semantic Scholar, OpenAlex, and Google Scholar

Usage:
    python main.py --bib references.bib
    python main.py --bib references.bib --output report.md
"""
import argparse
import sys
from pathlib import Path
from datetime import datetime
from dataclasses import dataclass, field
from typing import List, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
import copy

from src.parser import BibParser
from src.fetcher import (
    ArxivFetcher, CrossRefFetcher, DBLPFetcher,
    SemanticScholarFetcher, OpenAlexFetcher, ScholarFetcher
)
from src.comparator import MetadataComparator, EntryReport, resolve_year, CURRENT_YEAR
from src.sanitizer import BibSanitizer
from src.local_db import LocalConferenceDB
from src.ui import BibUI
from src.utils import ProgressDisplay, TextNormalizer
@dataclass
class WorkflowStep:
    """One lookup step of the validation workflow."""
    name: str  # step identifier; validate_entry dispatches on this value
    enabled: bool = True  # disabled steps are filtered out by WorkflowConfig.get_enabled_steps
    display_name: str = ""  # human-readable label for the step
    priority: int = 0  # sort key used by get_enabled_steps; lower values come first

@dataclass
class WorkflowConfig:
    """Ordered collection of workflow steps."""
    steps: List[WorkflowStep] = field(default_factory=list)

    def get_enabled_steps(self) -> List[WorkflowStep]:
        """Return the enabled steps, ordered by ascending priority."""
        active = [step for step in self.steps if step.enabled]
        # list.sort is stable, so steps sharing a priority keep their relative order.
        active.sort(key=lambda step: step.priority)
        return active

def get_default_workflow() -> WorkflowConfig:
    """Build the default workflow; Google Scholar is the only step disabled."""
    # (name, enabled, display name) in execution order; the position in this
    # table becomes the step priority.
    step_table = (
        ("arxiv_id", True, "arXiv by ID"),
        ("crossref_doi", True, "CrossRef by DOI"),
        ("semantic_scholar", True, "Semantic Scholar"),
        ("dblp", True, "DBLP"),
        ("openalex", True, "OpenAlex"),
        ("arxiv_title", True, "arXiv by Title"),
        ("crossref_title", True, "CrossRef by Title"),
        ("google_scholar", False, "Google Scholar"),
    )
    default_steps = [
        WorkflowStep(step_name, is_enabled, label, order)
        for order, (step_name, is_enabled, label) in enumerate(step_table)
    ]
    return WorkflowConfig(steps=default_steps)

def main():
    """Command-line entry point.

    Parses the arguments, checks that ``--bib`` names an existing regular
    file, and runs the fix-and-verify pipeline with the default workflow.

    Exits with status 1 when the bib file is missing (or is not a regular
    file) and with status 130 when the run is interrupted with Ctrl-C.
    """
    parser = argparse.ArgumentParser(
        description="BibGuard: Citation Fixer & Validator",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    parser.add_argument("--bib", "-b", required=True, help="Path to .bib file")
    # NOTE(review): --output is accepted but nothing below reads args.output.
    parser.add_argument("--output", "-o", help="Output report path (optional)")

    args = parser.parse_args()

    bib_path = Path(args.bib)
    # is_file() rather than exists(): a directory path would pass exists() and
    # only fail later, inside parsing, with a less helpful error.
    if not bib_path.is_file():
        print(f"Error: Bib file not found: {args.bib}")
        sys.exit(1)

    workflow = get_default_workflow()

    try:
        run_fix_and_verify(bib_path, workflow)
    except KeyboardInterrupt:
        print("\nCancelled")
        sys.exit(130)


def run_fix_and_verify(bib_path: Path, workflow):
    """Run validation, auto-fix issues, and verify.

    The bib file at ``bib_path`` is modified in place. Phases:

    0.   Offline sanitization (saved immediately if it changed anything) and
         duplicate-title reporting.
    0.5  Local conference DB lookup (informational only; every entry is still
         validated online).
    1.   Concurrent validation of every entry via ``validate_entry``.
    2.   Categorisation into OK / fix / manual review / remove, plus a report.
    3.   Applying fixes, interactive manual review, and saving the file.
    Pass 2 re-parses the saved file, validates it again and prints a summary.

    Args:
        bib_path: Path of the .bib file to process.
        workflow: Workflow configuration passed through to ``validate_entry``.

    Returns:
        None. Returns early when the file has no entries or when the analysis
        finds nothing to fix, review or remove.
    """
    progress = ProgressDisplay()
    bib_parser = BibParser()
    ui = BibUI()

    print(f"πŸ“š BibGuard - Auto-Fix & Verify")
    print(f"   Target: {bib_path}\n")

    # --- Pass 1: Validate & Fix ---
    entries = bib_parser.parse_file(str(bib_path))
    if not entries:
        print("No entries found")
        return

    print(f"Found {len(entries)} entries. Running validation and auto-fix...\n")

    # Initialize components
    fetchers = {
        'arxiv': ArxivFetcher(),
        'crossref': CrossRefFetcher(),
        'scholar': ScholarFetcher(),
        'semantic': SemanticScholarFetcher(),
        'openalex': OpenAlexFetcher(),
        'dblp': DBLPFetcher(),
    }
    comparator = MetadataComparator()
    sanitizer = BibSanitizer()

    fixed_count = 0
    fixed_details = {} # Key: entry_key, Value: list of changes
    removed_details = [] # List of (entry, reason)
    manual_review_queue = [] # List of (entry, best_result, candidates)

    # --- Phase 0: Sanitize (Offline Checks) ---
    print("🧹 Running formatting sanity checks...")
    sanitize_fixes = sanitizer.sanitize_all(entries)
    ui.show_sanitize_report(sanitize_fixes)

    # If sanitization made changes, save immediately so Phase 1 works on clean data
    if sanitize_fixes:
        bib_parser.save_entries(str(bib_path), entries)
        # Merge sanitize fixes into fixed_details for the final report
        for key, fixes in sanitize_fixes.items():
            if key not in fixed_details:
                fixed_details[key] = []
            for fix in fixes:
                fixed_details[key].append(fix.description)
            fixed_count += 1

    # Duplicate detection
    dupes = sanitizer.find_duplicates(entries)
    if dupes:
        print(f"\n⚠ Found {len(dupes)} duplicate title(s):")
        for title, keys in dupes.items():
            print(f"  {' / '.join(keys)}")
        print()

    # --- Phase 0.5: Local DB Lookup ---
    local_db = LocalConferenceDB()
    local_db_loaded = local_db.load()

    api_needed_entries = entries  # Always verify against live/network sources.
    if local_db_loaded:
        local_matched_count = 0
        for entry in entries:
            official = local_db.lookup(entry.title)
            if official:
                local_matched_count += 1

        if local_matched_count > 0:
            print(f"  πŸ“š Local DB matched: {local_matched_count}; still verifying all entries online")

    # --- Phase 1: Analysis (API Fetch) ---
    analysis_results = []

    with progress.progress_context(len(api_needed_entries), "Analyzing Entries") as prog:
        with ThreadPoolExecutor(max_workers=min(10, max(1, len(api_needed_entries)))) as executor:
            futures = {executor.submit(validate_entry, e, workflow, fetchers, comparator): e for e in api_needed_entries}

            for future in as_completed(futures):
                entry = futures[future]
                try:
                    best_result, candidates = future.result()
                    analysis_results.append((entry, best_result, candidates))
                    prog.update(entry.key, "Analyzed", 1)
                except Exception:
                    prog.mark_error()
                    prog.update(entry.key, "Failed", 1)
                    # Keep valid entry even if fetch failed
                    analysis_results.append((entry, None, []))

    # --- Phase 2: Meaningful Report ---
    # Categorize results
    to_fix = []
    to_review = []
    to_remove = []
    ok_entries = []

    for entry, best_result, candidates in analysis_results:
        if not best_result:
            # Validation itself failed; leave the entry untouched.
            ok_entries.append(entry)
            continue

        if best_result.is_match and best_result.fetched_data:
            to_fix.append((entry, best_result, candidates))
        elif candidates:
            to_review.append((entry, best_result, candidates))
        else:
            to_remove.append(entry)

    # Visualize Analysis Report
    ui.show_analysis_report(ok_entries, to_fix, to_review, to_remove)

    if not (to_fix or to_review or to_remove):
        return

    # --- Phase 3: Apply Fixes ---
    print(f"\nπŸš€ Applying fixes...")

    updated_entries = []
    # Add OK entries first (preserve order if we cared, but we sort later usually)
    updated_entries.extend(ok_entries)

    # Process Fixes
    for entry, best_result, candidates in to_fix:
        changes = apply_fix(entry, best_result.fetched_data, all_candidates=candidates, allow_optional_updates=True)
        if changes:
            fixed_count += 1
            # Extend rather than assign so sanitize-fix descriptions recorded
            # for the same key in Phase 0 are not lost from the final report.
            fixed_details.setdefault(entry.key, []).extend(changes)
        updated_entries.append(entry)

    # Process Removals
    for entry in to_remove:
        removed_details.append((entry, "No matching metadata found in any source"))
        # Do NOT add to updated_entries

    # Process Reviews (Add to queue)
    for item in to_review:
        manual_review_queue.append(item)
        updated_entries.append(item[0]) # Add tentatively, filter later if removed

    # --- Interactive Manual Review ---
    if manual_review_queue:
        print(f"\n\nπŸ” Manual Review Required for {len(manual_review_queue)} entries:")

        # Sort by key for consistent order
        manual_review_queue.sort(key=lambda x: x[0].key)

        entries_to_remove = set()

        for entry, best_res, candidates in manual_review_queue:
            ui.show_manual_review(entry, best_res, candidates, apply_fix)

            while True:
                choice = input(f"\nSelect [1-{len(candidates)}], (s)kip, (r)emove, or (q)uit: ").strip().lower()

                if choice == 'q':
                    print("Exiting manual review.")
                    # Keep remaining in queue as is (already in updated_entries)
                    break
                elif choice == 's':
                    print("Skipped.")
                    break
                elif choice == 'r':
                    print("Marked for removal.")
                    entries_to_remove.add(entry.key)
                    removed_details.append((entry, "Removed by user during manual review"))
                    break
                elif choice.isdigit():
                    idx = int(choice) - 1
                    if 0 <= idx < len(candidates):
                        selected = candidates[idx]
                        if not _candidate_exact_match(selected):
                            print("Cannot apply: selected candidate is not an exact title/author/year match.")
                            continue
                        changes = apply_fix(entry, selected.fetched_data, allow_optional_updates=True)
                        if changes:
                            fixed_count += 1
                            if entry.key not in fixed_details: fixed_details[entry.key] = []
                            fixed_details[entry.key].extend(changes)
                            print(f"Applied: {', '.join(changes)}")
                        else:
                            print("No changes needed for selected source.")
                        break
                    else:
                        print("Invalid selection.")
                else:
                    print("Invalid input.")

            # 'q' leaves the inner prompt loop; also stop walking the queue.
            if choice == 'q':
                break

        # Filter out removed entries
        if entries_to_remove:
            updated_entries = [e for e in updated_entries if e.key not in entries_to_remove]

    # Overwrite file if changes made (beyond Phase 0 sanitization)
    has_phase1_changes = any(k not in sanitize_fixes for k in fixed_details) or removed_details
    if has_phase1_changes or fixed_count > len(sanitize_fixes):
        bib_parser.save_entries(str(bib_path), updated_entries)

    # --- Pass 2: Double Check ---
    print("\nπŸ”„ Double checking (Re-validation)...")

    # The file may now be empty (every entry removed); treat a falsy parse
    # result as "no entries", mirroring the `not entries` check in Pass 1.
    entries = bib_parser.parse_file(str(bib_path)) or []
    reports = []

    with progress.progress_context(len(entries), "Verifying") as prog:
        # ThreadPoolExecutor raises ValueError for max_workers <= 0, so keep at
        # least one worker even when no entries are left (same guard as Phase 1).
        with ThreadPoolExecutor(max_workers=min(10, max(1, len(entries)))) as executor:
            # Note: validate_entry now returns tuple, need to handle
            futures = {executor.submit(validate_entry, e, workflow, fetchers, comparator): e for e in entries}

            for future in as_completed(futures):
                entry = futures[future]
                try:
                    best_result, _ = future.result() # Ignore candidates in verify pass
                    reports.append(EntryReport(entry=entry, comparison=best_result))

                    if best_result.is_match:
                        prog.mark_success()
                    else:
                        prog.mark_error()
                    prog.update(entry.key, "Verified", 1)
                except Exception:
                    prog.mark_error()
                    prog.update(entry.key, "Failed", 1)

    # Summary
    total = len(entries)
    verified = sum(1 for r in reports if r.comparison and r.comparison.is_match)
    issues = sum(1 for r in reports if r.comparison and r.comparison.has_issues)
    not_found = sum(1 for r in reports if r.comparison and not r.comparison.is_match and not r.comparison.has_issues)

    # Visual Final Status
    ui.show_final_report(total, verified, issues, not_found, reports, fixed_count, fixed_details, removed_details)
    print("")

def apply_local_fix(entry, official) -> list:
    """
    Apply non-core fixes taken from a local conference DB record.

    Title, authors and year are never touched here. Three things may change:
    the entry type (misc/article becomes inproceedings when the record has a
    booktitle), a missing booktitle, and a missing DOI. The entry is mutated
    in place and a list of human-readable change descriptions is returned.
    """
    applied = []
    db_venue = official.booktitle

    # Entry type upgrade, only for @misc / @article entries.
    if db_venue and entry.entry_type.lower() in ('misc', 'article'):
        previous_type = entry.entry_type
        entry.entry_type = 'inproceedings'
        raw = entry.raw_entry
        if 'ENTRYTYPE' in raw:
            raw['ENTRYTYPE'] = 'inproceedings'
        # An arXiv "journal" is dropped once the entry is a proceedings paper.
        journal = entry.journal
        if journal and 'arxiv' in journal.lower():
            entry.journal = ""
            raw.pop('journal', None)
        applied.append(f"Type: @{previous_type} β†’ @inproceedings [local_db]")

    # Booktitle is filled in only when the entry has none.
    if db_venue and not entry.booktitle:
        entry.booktitle = db_venue
        entry.raw_entry['booktitle'] = db_venue
        applied.append(f"Booktitle: [Added] {db_venue[:50]}... [local_db]")

    # DOI is filled in only when the entry has none.
    db_doi = official.doi
    if db_doi and not entry.doi:
        entry.doi = db_doi
        entry.raw_entry['doi'] = db_doi
        applied.append(f"DOI: [Added] {db_doi} [local_db]")

    return applied


def apply_fix(
    entry,
    data,
    all_candidates=None,
    *,
    allow_core_updates: bool = False,
    allow_optional_updates: bool = False,
) -> list:
    """Update only safe metadata by default.

    Core identity fields (title, author, year) are not overwritten unless
    allow_core_updates=True, so a nearby candidate is never silently turned
    into a different citation. The only optional field handled is the DOI,
    which is added when allow_optional_updates=True and the entry has none.

    Args:
        entry: Bib entry to mutate in place (title, year, author, doi).
        data: Fetched metadata; ``title`` is read directly, ``year``,
            ``authors`` and ``doi`` are read if present.
        all_candidates: Optional list of comparison results. Used to resolve
            the year via ``resolve_year`` and to detect author-initial
            conflicts.
        allow_core_updates: Permit title / year / author changes.
        allow_optional_updates: Permit adding a missing DOI.

    Returns:
        List of human-readable change (or warning) descriptions; empty when
        nothing was changed.
    """
    # NOTE(review): unlike apply_local_fix, nothing here is mirrored into
    # entry.raw_entry - confirm the writer serializes from these attributes.
    changes = []

    # Helper to clean string
    def clean(value):
        return str(value).strip() if value else ""

    # Title
    new_title = clean(data.title)
    if allow_core_updates and new_title and new_title.lower() != entry.title.lower():
        changes.append(f"Title: {entry.title} -> {new_title}")
        entry.title = new_title

    # Year: Use resolve_year() if we have multiple candidates
    if allow_core_updates:
        if all_candidates:
            best_year, year_src = resolve_year(all_candidates, bib_year=entry.year)
            if best_year and best_year != entry.year:
                # Guard the int() conversion the same way the single-candidate
                # branch does, so a non-numeric year cannot raise ValueError.
                if str(best_year).isdigit() and int(best_year) > CURRENT_YEAR:
                    changes.append(f"⚠ Skip suspicious future year {best_year} from {year_src}")
                else:
                    changes.append(f"Year: {entry.year} -> {best_year} [{year_src}]")
                    entry.year = best_year
        else:
            # Single candidate fallback
            new_year = clean(getattr(data, 'year', ''))
            if new_year and new_year != entry.year:
                if new_year.isdigit() and int(new_year) > CURRENT_YEAR:
                    changes.append(f"⚠ Skip suspicious future year {new_year}")
                else:
                    changes.append(f"Year: {entry.year} -> {new_year}")
                    entry.year = new_year

    # Author: Smart Merge Strategy
    # Check for author initial conflict first
    has_initial_conflict = any(
        getattr(cand, 'author_initial_conflict', False)
        for cand in (all_candidates or [])
    )

    if allow_core_updates and has_initial_conflict:
        # Don't overwrite authors when initials conflict
        changes.append("⚠ Author initial conflict detected β€” preserving bib authors")
    elif allow_core_updates:
        # Normal author merge logic
        current_authors_raw = TextNormalizer.parse_author_list(entry.author)
        current_authors_norm = [TextNormalizer.normalize_author_name(a) for a in current_authors_raw]

        # A missing or None author list is treated as "no authors".
        new_authors_list = getattr(data, 'authors', None) or []
        if isinstance(new_authors_list, str):
            new_authors_list = TextNormalizer.parse_author_list(new_authors_list)

        # Strip DBLP disambiguation IDs from new authors
        new_authors_list = [TextNormalizer.strip_dblp_disambiguation_id(str(a)) for a in new_authors_list]

        # Also check if the EXISTING bib authors have DBLP disambiguation IDs baked in
        for raw_auth in current_authors_raw:
            if TextNormalizer.has_dblp_disambiguation_id(raw_auth.strip()):
                changes.append(f"⚠ DBLP disambiguation ID detected in author: '{raw_auth.strip()}'")

        # Only merge when the source actually supplied authors; an empty list
        # would otherwise replace the entry's authors with an empty string.
        if new_authors_list:
            final_authors = []
            for new_auth in new_authors_list:
                new_auth_str = str(new_auth).strip()
                new_auth_norm = TextNormalizer.normalize_author_name(new_auth_str)
                # Keep the bib's own spelling for authors it already lists
                # (first match wins); otherwise adopt the fetched string.
                kept = next(
                    (
                        old_raw.strip()
                        for old_raw, old_norm in zip(current_authors_raw, current_authors_norm)
                        if old_norm == new_auth_norm
                    ),
                    new_auth_str,
                )
                final_authors.append(kept)

            # Reconstruct the string
            new_author_str = " and ".join(final_authors)

            # Check if the result is effectively different from the original full string
            def simple_norm(s):
                return s.lower().replace(" ", "").strip()

            if simple_norm(new_author_str) != simple_norm(entry.author):
                old_auth = (entry.author[:50] + '...') if len(entry.author) > 50 else entry.author
                new_auth_disp = (new_author_str[:50] + '...') if len(new_author_str) > 50 else new_author_str
                changes.append(f"Author: {old_auth} -> {new_auth_disp}")
                entry.author = new_author_str

    # Optional fields: only the DOI is handled, and only when the entry has none
    if allow_optional_updates and hasattr(data, 'doi') and data.doi and not entry.doi:
        changes.append(f"DOI: [Added] {data.doi}")
        entry.doi = data.doi

    return changes


def _candidate_exact_match(candidate) -> bool:
    return bool(
        candidate
        and getattr(candidate, "is_match", False)
        and getattr(candidate, "title_match", False)
        and getattr(candidate, "author_match", False)
        and getattr(candidate, "year_match", False)
        and not getattr(candidate, "author_initial_conflict", False)
    )


def validate_entry(entry, workflow, fetchers, comparator):
    """Validate a single entry against configured data sources.

    Runs every enabled workflow step in order. Each step that yields fetched
    metadata is compared with the entry, and the comparison is tagged with
    the step name (``evidence_step``) and the fetched record's ``url``
    (``evidence_url``, empty when the record has none).

    Args:
        entry: Bib entry; ``title``, ``doi``, ``has_arxiv`` and ``arxiv_id``
            are read.
        workflow: Object providing ``get_enabled_steps()``.
        fetchers: Dict of fetchers keyed by 'arxiv', 'crossref', 'semantic',
            'dblp', 'openalex' and 'scholar'.
        comparator: Object providing ``compare`` and ``create_unable_result``.

    Returns:
        (best_result, all_results). ``best_result`` is the highest-confidence
        comparison after the cross-source and evidence guards have run. When
        no step produced a result, returns (unable_result, []).
    """
    results = []

    for step in workflow.get_enabled_steps():
        result = None
        data = None

        if step.name == "arxiv_id" and entry.has_arxiv:
            data = fetchers['arxiv'].fetch_by_id(entry.arxiv_id)
            if data:
                result = comparator.compare(entry, data, "arxiv")

        elif step.name == "crossref_doi" and entry.doi:
            data = fetchers['crossref'].search_by_doi(entry.doi)
            if data:
                # DOI cross-validation: check if the DOI actually resolves to this paper
                doi_fixes = BibSanitizer().check_doi_title_match(entry, data)
                if doi_fixes:
                    # DOI points to a different work - skip this result.
                    # NOTE(review): the original comment says the check has
                    # already cleared the bad DOI from the entry - confirm.
                    result = None
                else:
                    result = comparator.compare(entry, data, "crossref")

        elif step.name == "semantic_scholar" and entry.title:
            # Prefer the DOI lookup; fall back to a title search.
            data = fetchers['semantic'].fetch_by_doi(entry.doi) if entry.doi else None
            if not data:
                data = fetchers['semantic'].search_by_title(entry.title)
            if data:
                result = comparator.compare(entry, data, "semantic_scholar")

        elif step.name == "dblp" and entry.title:
            data = fetchers['dblp'].search_by_title(entry.title)
            if data:
                result = comparator.compare(entry, data, "dblp")

        elif step.name == "openalex" and entry.title:
            # Prefer the DOI lookup; fall back to a title search.
            data = fetchers['openalex'].fetch_by_doi(entry.doi) if entry.doi else None
            if not data:
                data = fetchers['openalex'].search_by_title(entry.title)
            if data:
                result = comparator.compare(entry, data, "openalex")

        elif step.name == "arxiv_title" and entry.title:
            metas = fetchers['arxiv'].search_by_title(entry.title)
            if metas:
                # Pick the hit whose normalized title is most similar.
                norm1 = TextNormalizer.normalize_for_comparison(entry.title)
                best_meta, best_sim = None, 0
                for m in metas:
                    sim = TextNormalizer.similarity_ratio(
                        norm1, TextNormalizer.normalize_for_comparison(m.title)
                    )
                    if sim > best_sim:
                        best_meta, best_sim = m, sim
                if best_meta and best_sim > 0.5:
                    # Record the chosen hit as this step's data so its URL is
                    # picked up as evidence below, like every other step.
                    data = best_meta
                    result = comparator.compare(entry, best_meta, "arxiv")

        elif step.name == "crossref_title" and entry.title:
            data = fetchers['crossref'].search_by_title(entry.title)
            if data:
                result = comparator.compare(entry, data, "crossref")

        elif step.name == "google_scholar" and entry.title:
            data = fetchers['scholar'].search_by_title(entry.title)
            if data:
                result = comparator.compare(entry, data, "scholar")

        if result:
            result.evidence_step = step.name
            result.evidence_url = getattr(data, "url", "") if data else ""
            results.append(result)

    if results:
        best = max(results, key=lambda r: r.confidence)
        # Both guards may downgrade `best` in place (is_match / confidence).
        _apply_cross_source_conflict_guard(best, results)
        _apply_evidence_guard(best, results)
        return best, results

    # No results
    return comparator.create_unable_result(entry, "Not found in any data source"), []


def _apply_cross_source_conflict_guard(best, results) -> None:
    """Reject candidates when exact-title sources disagree on core metadata."""
    if not best or not getattr(best, "fetched_title", ""):
        return

    conflicts = []
    for result in results:
        if result is best:
            continue
        if getattr(result, "title_similarity", 0.0) < 0.95:
            continue

        best_year = str(getattr(best, "fetched_year", "") or "").strip()
        other_year = str(getattr(result, "fetched_year", "") or "").strip()
        if best_year and other_year and best_year != other_year:
            conflicts.append(f"{result.source}={other_year}")

    if not conflicts:
        return

    issue = (
        f"Cross-source year conflict: best {best.source}={best.fetched_year}, "
        f"also found {'; '.join(dict.fromkeys(conflicts))}"
    )
    if issue not in best.issues:
        best.issues.append(issue)
    best.is_match = False
    best.confidence = min(best.confidence, 0.8)


def _apply_evidence_guard(best, results) -> None:
    """Require primary evidence or at least two agreeing exact sources."""
    if not best or not getattr(best, "is_match", False):
        return

    evidence_step = getattr(best, "evidence_step", "")
    if evidence_step in {"arxiv_id", "arxiv_title", "crossref_doi"}:
        return

    best_year = str(getattr(best, "fetched_year", "") or "").strip()
    agreeing_sources = {getattr(best, "source", "")}
    for result in results:
        if result is best or not getattr(result, "is_match", False):
            continue
        if getattr(result, "title_similarity", 0.0) < 0.95:
            continue
        other_year = str(getattr(result, "fetched_year", "") or "").strip()
        if best_year and other_year == best_year:
            agreeing_sources.add(getattr(result, "source", ""))

    if len(agreeing_sources) >= 2:
        return

    issue = (
        "Insufficient evidence: exact match found only in "
        f"{best.source}; needs arXiv/DOI evidence or another agreeing source"
    )
    if issue not in best.issues:
        best.issues.append(issue)
    best.is_match = False
    best.confidence = min(best.confidence, 0.8)





# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()