Buckets:
linvest21/shft-artifacts / code /self_healing_finetuning /data_pipeline /source_quality_certifier.py
| from __future__ import annotations | |
| import re | |
| from typing import Any | |
# Phrases whose presence (as plain substrings of the lower-cased text) raises a
# source's certification score in both the pre-download and the content pass.
# Order is preserved in the reported "high_signal" matches.
HIGH_SIGNAL_TERMS = [
    # Warnings and illustrations.
    "red flag", "warning sign", "case study", "worked example", "example",
    # Structured evaluation and explicit verdicts.
    "checklist", "rubric", "decision", "pass fail", "pass/fail", "reject", "approve",
    # Explanatory / question-and-answer vocabulary.
    "because", "analysis", "question", "answer", "quiz", "scenario", "diagnostic",
    # Accounting-quality subject matter.
    "fraud", "accounting quality", "cash flow", "earnings quality",
]
# Phrases whose presence lowers a source's certification score: regulatory
# filings, generic report names, and search/glossary pages.
LOW_SIGNAL_BULK_TERMS = [
    # Filings and periodic reports.
    "form 10-k", "form 10-q", "annual report", "quarterly report",
    "proxy statement", "company filings",
    # Navigation / reference pages.
    "search", "glossary",
]
# Role name -> phrases counted as role-specific signal when they occur as
# substrings of the lower-cased text being certified.
ROLE_SIGNAL_TERMS = {
    "researcher": [
        "valuation", "moat", "roic", "earnings quality", "base rate",
        "financial statement analysis", "term structure", "cash flow coverage",
        "issuer analysis", "capital structure", "relative value",
    ],
    "portfolio_manager": [
        "portfolio construction", "position sizing", "rebalancing", "tradeoff",
        "risk budget", "portfolio allocation", "active management",
        "sector allocation", "carry", "roll down", "roll-down",
    ],
    "risk_manager": [
        "red flag", "risk control", "stress test", "liquidity risk",
        "internal control", "drawdown", "market risk", "credit risk",
        "stress scenario", "capital requirement", "limit breach",
    ],
    "performance_manager": [
        "attribution", "benchmark", "tracking error", "information ratio",
        "risk adjusted", "performance attribution", "return attribution",
        "selection effect", "allocation effect",
    ],
    "client_portfolio_manager": [
        "client explanation", "suitability", "risk tolerance", "client scenario",
        "investment objective", "income objective", "time horizon",
        "liquidity need",
    ],
    "chief_investment_officer": [
        "investment policy", "capital market assumption",
        "capital market assumptions", "committee", "governance",
        "policy portfolio", "strategic allocation", "asset allocation",
        "liability",
    ],
}
# Asset-class name -> phrases counted as asset-class signal when they occur as
# substrings of the lower-cased text being certified.
ASSET_SIGNAL_TERMS = {
    "equity": [
        "equity", "stock", "public company", "earnings", "cash flow", "valuation",
    ],
    "fixed_income": [
        # Instruments and issuers.
        "fixed income", "fixed-income", "bond", "bonds", "treasury", "treasuries",
        "municipal", "sovereign", "coupon",
        # Rates and curve.
        "yield", "yield curve", "curve", "term structure", "interest rate",
        "interest rates", "rates", "monetary policy",
        # Credit and spreads.
        "credit", "credit risk", "credit spread", "spread", "spreads", "oas",
        # Risk measures.
        "duration", "convexity", "dv01", "cs01",
        # Default modelling and structure.
        "default", "recovery", "covenant", "merton", "vasicek", "securitized",
        "mortgage", "mbs", "high yield", "investment grade", "liquidity premium",
        "carry", "roll down", "roll-down",
        # Regulatory / balance-sheet vocabulary.
        "frtb", "irrbb", "basel", "alm", "liability",
    ],
    "multi_asset": [
        "multi asset", "multi-asset", "asset allocation", "cross asset",
        "cross-asset", "diversification", "portfolio", "macro",
        "strategic allocation", "tactical allocation", "policy portfolio",
        "capital market assumption", "capital market assumptions",
        "correlation", "covariance", "rebalancing", "risk parity",
        "factor", "factors", "alternatives", "alternative assets", "real assets",
        "inflation", "real rates", "regime", "drawdown", "currency", "liquidity",
        "allocation effect",
    ],
}
| def _contains_any(text: str, terms: list[str]) -> list[str]: | |
| found: list[str] = [] | |
| for term in terms: | |
| if term in text: | |
| found.append(term) | |
| return found | |
| def _quality_requires_critical_reasoning(quality_errors: list[str] | None) -> bool: | |
| haystack = " ".join(quality_errors or []).lower() | |
| return "critical_pass" in haystack or "critical pass" in haystack | |
def _match_separator_normalized_terms(text: str, terms: list[str]) -> list[str]:
    """Return the terms found in ``text`` after separator normalization.

    ``text`` must already have runs of ``-``/``_`` collapsed to single spaces.
    Each term gets the same treatment before the substring test, so hyphenated
    vocabulary such as "form 10-k" can still match the normalized text.
    Spellings that normalize to the same phrase (for example "roll down" and
    "roll-down") are counted once, reported under whichever appears first in
    ``terms``. Blank terms are ignored because an empty string is a substring
    of every text.
    """
    hits: list[str] = []
    seen: set[str] = set()
    for term in terms:
        normalized = re.sub(r"[_-]+", " ", term)
        if not normalized.strip() or normalized in seen:
            continue
        seen.add(normalized)
        if normalized in text:
            hits.append(term)
    return hits


def certify_source_candidate(
    *,
    asset_class: str,
    role: str,
    title: str,
    url: str,
    source_type: str,
    rationale: str,
    quality_errors: list[str] | None = None,
    policy: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Pre-download source-quality certification.

    This is an intentionally transparent local classifier. It does not certify
    legal rights. It certifies whether the candidate looks likely to improve
    the currently failing model-quality gate before SHFT spends time
    downloading and promoting it.

    Only ``title``, ``url`` and ``rationale`` are inspected. They are joined,
    lower-cased, "%20" is turned into a space, and runs of ``-``/``_`` are
    collapsed to spaces so URL slugs read like prose. Term vocabularies are
    normalized the same way before matching.

    Args:
        asset_class: Key into ``ASSET_SIGNAL_TERMS``; an unknown value falls
            back to matching the label itself (underscores as spaces).
        role: Key into ``ROLE_SIGNAL_TERMS``; same fallback as ``asset_class``.
        title: Candidate title.
        url: Candidate URL.
        source_type: Format tag; "jsonl"/"hf_finetune_jsonl" add 4.0 to the
            score, "pdf"/"html"/"txt"/"md" add 0.5.
        rationale: Free-text reason the candidate was proposed.
        quality_errors: Current gate failures; a mention of the critical-pass
            gate turns on the critical-reasoning requirement.
        policy: Optional settings; the "source_quality_certification" section
            may set "min_training_score", "min_verification_score" and
            "require_critical_reasoning_when_gate_fails".

    Returns:
        A dict with the verdict ("intended_use" is "training", "verification"
        or "reject"), the score and thresholds, the matched terms, and any
        blockers.
    """
    # Tolerate a missing policy as well as an explicit ``None`` section.
    cfg = (policy or {}).get("source_quality_certification") or {}
    min_training_score = float(cfg.get("min_training_score", 4.0))
    min_verification_score = float(cfg.get("min_verification_score", 2.0))
    require_critical = bool(
        cfg.get("require_critical_reasoning_when_gate_fails", True)
    ) and _quality_requires_critical_reasoning(quality_errors)

    text = f"{title} {url} {rationale}".lower().replace("%20", " ")
    text = re.sub(r"[_-]+", " ", text)

    role_terms = ROLE_SIGNAL_TERMS.get(role, [role.replace("_", " ").lower()])
    asset_terms = ASSET_SIGNAL_TERMS.get(
        asset_class, [asset_class.replace("_", " ").lower()]
    )
    high_hits = _match_separator_normalized_terms(text, HIGH_SIGNAL_TERMS)
    role_hits = _match_separator_normalized_terms(text, role_terms)
    asset_hits = _match_separator_normalized_terms(text, asset_terms)
    low_hits = _match_separator_normalized_terms(text, LOW_SIGNAL_BULK_TERMS)

    # Each category is capped so one very long title cannot dominate the score.
    score = 0.0
    score += min(len(high_hits), 5) * 1.5
    score += min(len(role_hits), 4) * 1.0
    score += min(len(asset_hits), 4) * 0.75
    if source_type in {"jsonl", "hf_finetune_jsonl"}:
        score += 4.0
    if source_type in {"pdf", "html", "txt", "md"}:
        score += 0.5
    if low_hits:
        score -= min(len(low_hits), 4) * 1.25

    critical_terms = (
        "red flag",
        "warning sign",
        "case study",
        "worked example",
        "checklist",
        "rubric",
        "decision",
        "pass fail",
        "pass/fail",
        "reject",
    )
    blockers: list[str] = []
    if require_critical and not any(term in high_hits for term in critical_terms):
        blockers.append("missing explicit critical pass/fail or red-flag reasoning signal")
    if not asset_hits:
        blockers.append(f"missing asset-class signal for {asset_class}")
    # The researcher role is exempt from the role-signal requirement.
    if not role_hits and role not in {"researcher"}:
        blockers.append(f"missing role-specific signal for {role}")
    if low_hits and score < min_training_score:
        blockers.append("looks like low-signal bulk/domain prose rather than reasoning supervision")

    training_eligible = score >= min_training_score and not blockers
    verification_eligible = (
        not training_eligible and score >= min_verification_score and bool(asset_hits)
    )
    if training_eligible:
        intended_use = "training"
        rationale_text = "candidate appears likely to improve failing model-quality gate with dense role/asset reasoning signal"
    elif verification_eligible:
        intended_use = "verification"
        rationale_text = "candidate may help verification coverage but is not dense enough for training"
    else:
        intended_use = "reject"
        rationale_text = "candidate is not expected to improve fine-tuning enough to justify download/promotion"

    return {
        "schema_version": "source_ai_certification_v1",
        "method": "transparent_local_ai_source_quality_classifier_v1",
        "intended_use": intended_use,
        "training_eligible": training_eligible,
        "verification_eligible": verification_eligible,
        "score": round(score, 4),
        "min_training_score": min_training_score,
        "min_verification_score": min_verification_score,
        "require_critical_reasoning": require_critical,
        "matched_terms": {
            "high_signal": high_hits,
            "asset": asset_hits,
            "role": role_hits,
            "low_signal": low_hits,
        },
        "blockers": blockers,
        "rationale": rationale_text,
    }
def _content_signal_terms(table: dict[str, list[str]], key: str) -> list[str]:
    """Return the vocabulary for ``key``, or the label itself when unknown.

    An unknown key falls back to the label with underscores as spaces and
    lower-cased, matching what the pre-download certifier does. A blank label
    yields no terms, because an empty string is a substring of every text.
    """
    known = table.get(key)
    if known is not None:
        return known
    label = key.replace("_", " ").lower().strip()
    return [label] if label else []


def certify_normalized_source_content(
    *,
    asset_class: str,
    role: str,
    title: str,
    url: str,
    source_type: str,
    text: str,
    quality_errors: list[str] | None = None,
    policy: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Certify extracted source text after download/normalization.

    URL/title certification is only an expected-value screen. This second pass
    verifies the actual text that would enter the corpus, so scraped glossary,
    navigation, or raw filing prose cannot become training data merely because
    the source looked useful before download.

    Args:
        asset_class: Key into ``ASSET_SIGNAL_TERMS``; an unknown value falls
            back to matching the label itself, as the pre-download pass does.
        role: Key into ``ROLE_SIGNAL_TERMS``; same fallback as ``asset_class``.
        title: Echoed into the result; not scored.
        url: Echoed into the result; not scored.
        source_type: Echoed into the result; not scored.
        text: Extracted content to evaluate; ``None``/empty is treated as "".
        quality_errors: Current gate failures; a mention of the critical-pass
            gate turns on the critical-reasoning requirements.
        policy: Optional settings; the "source_quality_certification" section
            may set "min_training_score", "min_verification_score",
            "require_critical_reasoning_when_gate_fails",
            "min_content_reasoning_density" and "min_content_words".

    Returns:
        A dict with the verdict ("intended_use" is "training", "verification"
        or "reject"), the score and thresholds, content metrics, matched
        terms, blockers, the echoed source fields, and a rationale string.
    """
    text = text or ""
    lower = text.lower()
    word_count = len(re.findall(r"[a-z0-9]+", lower))
    # Terminal punctuation followed by whitespace marks a sentence break; the
    # "+ 1" accounts for the final sentence, so the count is never below 1.
    sentence_count = len(re.findall(r"[.!?]\s+", text)) + 1

    high_hits = _contains_any(lower, HIGH_SIGNAL_TERMS)
    low_hits = _contains_any(lower, LOW_SIGNAL_BULK_TERMS)
    role_hits = _contains_any(lower, _content_signal_terms(ROLE_SIGNAL_TERMS, role))
    asset_hits = _contains_any(lower, _content_signal_terms(ASSET_SIGNAL_TERMS, asset_class))

    because_count = len(re.findall(r"\bbecause\b|\btherefore\b|\bso that\b|\bwhy\b", lower))
    decision_count = len(re.findall(r"\b(pass|fail|reject|approve|avoid|flag|warning|red flag|watchlist)\b", lower))
    reasoning_density = round((because_count + decision_count + len(high_hits)) / sentence_count, 6)

    # Tolerate a missing policy as well as an explicit ``None`` section.
    cfg = (policy or {}).get("source_quality_certification") or {}
    min_training_score = float(cfg.get("min_training_score", 4.0))
    min_verification_score = float(cfg.get("min_verification_score", 2.0))
    require_critical = bool(
        cfg.get("require_critical_reasoning_when_gate_fails", True)
    ) and _quality_requires_critical_reasoning(quality_errors)
    min_reasoning_density = float(cfg.get("min_content_reasoning_density", 0.025))
    min_content_words = int(cfg.get("min_content_words", 120))

    # Every component is capped, and the total is floored at zero.
    score = 0.0
    score += min(4.0, len(high_hits) * 0.8)
    score += min(2.0, len(role_hits) * 0.5)
    score += min(2.0, len(asset_hits) * 0.5)
    score += min(2.0, reasoning_density * 20.0)
    score -= min(2.0, len(low_hits) * 0.25)
    score = round(max(0.0, score), 4)

    blockers: list[str] = []
    if word_count < min_content_words:
        blockers.append(f"content_words_below_minimum:{word_count}<{min_content_words}")
    if require_critical:
        if decision_count == 0:
            blockers.append("content_missing_explicit_pass_fail_or_red_flag_decision_terms")
        if because_count == 0:
            blockers.append("content_missing_because_or_therefore_reasoning_terms")
        if reasoning_density < min_reasoning_density:
            blockers.append(f"content_reasoning_density_below_minimum:{reasoning_density}<{min_reasoning_density}")
    if not asset_hits:
        blockers.append(f"content_missing_asset_class_signal:{asset_class}")
    if not role_hits:
        blockers.append(f"content_missing_role_signal:{role}")
    if low_hits and not high_hits:
        blockers.append("content_looks_like_low_signal_bulk_or_glossary_text")

    training_eligible = score >= min_training_score and not blockers
    verification_eligible = (
        not training_eligible and score >= min_verification_score and bool(asset_hits)
    )
    if training_eligible:
        intended_use = "training"
    elif verification_eligible:
        intended_use = "verification"
    else:
        intended_use = "reject"

    rationale = (
        f"Post-download content certification scored {score:.2f}; "
        f"reasoning_density={reasoning_density:.4f}, decision_terms={decision_count}, because_terms={because_count}."
    )
    if blockers:
        rationale += " Blockers: " + "; ".join(blockers)

    return {
        "schema_version": "source_content_ai_certification_v1",
        "method": "transparent_local_ai_source_content_quality_classifier_v1",
        "intended_use": intended_use,
        "training_eligible": training_eligible,
        "verification_eligible": verification_eligible,
        "score": score,
        "min_training_score": min_training_score,
        "min_verification_score": min_verification_score,
        "require_critical_reasoning": require_critical,
        "metrics": {
            "word_count": word_count,
            "sentence_count": sentence_count,
            "because_terms": because_count,
            "decision_terms": decision_count,
            "reasoning_density": reasoning_density,
        },
        "matched_terms": {
            "high_signal": high_hits,
            "low_signal_bulk": low_hits,
            "role": role_hits,
            "asset": asset_hits,
        },
        "blockers": blockers,
        "title": title,
        "url": url,
        "source_type": source_type,
        "rationale": rationale,
    }
Xet Storage Details
- Size:
- 13.2 kB
- Xet hash:
- e1ba0041829d1ef1543c311428048e7aeb942900f1b0851ad01c71e6fa7a2753
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.