Spaces:

CiscsoPonce
/

PrimoGreedy-Agent

Running

App Files Files Community

PrimoGreedy-Agent / scripts /evaluators.py

CiscsoPonce

feat: Sprint 9 — Execution & Quality Control

645673f about 1 month ago

raw

history blame contribute delete

9.68 kB

	"""Custom LangSmith evaluators for PrimoGreedy analyst pipeline.

	Evaluator categories:
	1. Hallucination catchers (LLM-as-a-Judge) — catalyst_grounding_score, company_identity_score
	2. Format verifiers (exact-match) — format_score, verdict_validity_score
	3. Math verifier — kelly_math_score

	Each evaluator conforms to the ``langsmith.evaluate()`` protocol:
	    def evaluator(run, example) -> EvaluationResult | dict
	"""

	import os
	import re

	from dotenv import load_dotenv

	load_dotenv()

	# Verdict labels accepted by the verdict-validity evaluator.
	VALID_VERDICTS = {
	    "STRONG BUY",
	    "BUY",
	    "WATCH",
	    "AVOID",
	}

	# Markdown section headers the format evaluator expects exactly once each.
	REQUIRED_HEADERS = [
	    "### THE QUANTITATIVE BASE",
	    "### THE LYNCH PITCH",
	    "### THE MUNGER INVERT",
	    "### FINAL VERDICT",
	]


	# ---------------------------------------------------------------------------
	# 1. Hallucination catchers (LLM-as-a-Judge)
	# ---------------------------------------------------------------------------

	def catalyst_grounding_score(run, example) -> dict:
	    """Score whether the Lynch Pitch catalyst is grounded in the provided context.

	    An LLM judge compares the "THE LYNCH PITCH" section of
	    ``run.outputs["final_verdict"]`` with the ``financial_data``,
	    ``sec_context`` and ``deep_fundamentals`` values found in ``run.inputs``.

	    Args:
	        run: Object exposing ``inputs`` and ``outputs`` mappings (either may
	            be ``None``).
	        example: Unused; kept for the evaluator call signature.

	    Returns:
	        A dict with ``key`` ("catalyst_grounding"), ``score`` clamped to
	        [0, 1] (0 = fabricated, 1 = fully grounded) and ``comment``. The
	        score is a neutral 0.5 when there is nothing to judge or when the
	        judge call fails; this function does not raise for judge errors.
	    """
	    import json

	    inputs = run.inputs or {}
	    outputs = run.outputs or {}

	    # Each context field is capped separately before being joined.
	    field_limits = (
	        ("financial_data", 3000),
	        ("sec_context", 2000),
	        ("deep_fundamentals", 2000),
	    )
	    context = "\n".join(
	        str(inputs[name])[:limit] for name, limit in field_limits if inputs.get(name)
	    )

	    # ``or ""`` keeps a missing/None verdict empty instead of the text "None".
	    verdict_text = str(outputs.get("final_verdict") or "")

	    # Capture everything between the Lynch Pitch header line and the next
	    # "###" header (or end of text). The alternation must be an unescaped
	    # "|": an escaped one would demand a literal "###|" at end of text.
	    lynch_match = re.search(
	        r"###\s*THE LYNCH PITCH[^\n]*\n(.*?)(?=###|\Z)",
	        verdict_text,
	        re.DOTALL,
	    )
	    lynch_pitch = lynch_match.group(1).strip() if lynch_match else verdict_text[:500]

	    if not context or not lynch_pitch:
	        return {"key": "catalyst_grounding", "score": 0.5, "comment": "Insufficient data"}

	    # Deliberately broad boundary: a missing dependency, network failure or
	    # malformed reply yields a neutral score instead of raising.
	    try:
	        from langchain_openai import ChatOpenAI

	        judge_llm = ChatOpenAI(
	            model=os.getenv("EVAL_MODEL", "nvidia/nemotron-3-nano-30b-a3b:free"),
	            api_key=os.getenv("OPENROUTER_API_KEY"),
	            base_url="https://openrouter.ai/api/v1",
	            temperature=0,
	            max_tokens=256,
	        )

	        judge_prompt = (
	            "You are a fact-checking judge. Given the CONTEXT the analyst received "
	            "and the CATALYST CLAIM it made, determine whether the claim has "
	            "grounding in the context.\n\n"
	            "Score on a scale from 0.0 (completely fabricated, no evidence in context) "
	            "to 1.0 (fully grounded in the data provided).\n\n"
	            "Respond with ONLY a JSON object: {\"score\": <float>, \"reason\": \"<short reason>\"}\n\n"
	            f"CONTEXT:\n{context[:4000]}\n\n"
	            f"CATALYST CLAIM:\n{lynch_pitch[:1000]}"
	        )

	        response = judge_llm.invoke(judge_prompt)
	        raw = response.content if isinstance(response.content, str) else str(response.content)

	        # Tolerate replies that wrap the JSON object in code fences or prose
	        # by parsing the outermost {...} span when one exists.
	        json_span = re.search(r"\{.*\}", raw, re.DOTALL)
	        try:
	            result = json.loads(json_span.group(0) if json_span else raw)
	            score = float(result.get("score", 0.5))
	            reason = str(result.get("reason", ""))
	        except (ValueError, TypeError, AttributeError):
	            # Not a usable JSON object (JSONDecodeError is a ValueError):
	            # fall back to the first number in the reply.
	            score_match = re.search(r"\d+(?:\.\d+)?", raw)
	            score = float(score_match.group(0)) if score_match else 0.5
	            reason = raw[:200]

	        return {"key": "catalyst_grounding", "score": max(0, min(1, score)), "comment": reason}

	    except Exception as exc:
	        return {"key": "catalyst_grounding", "score": 0.5, "comment": f"Judge error: {exc}"}


	def company_identity_score(run, example) -> dict:
	    """Check whether the analyst report describes the company's real business.

	    An LLM judge compares ``run.inputs["financial_data"]`` (treated as
	    ground truth) with ``run.outputs["final_verdict"]`` to catch
	    hallucinations such as "High Arctic = Arctic drilling".

	    Args:
	        run: Object exposing ``inputs`` and ``outputs`` mappings (either may
	            be ``None``).
	        example: Unused; kept for the evaluator call signature.

	    Returns:
	        A dict with ``key`` ("company_identity"), ``score`` clamped to
	        [0, 1] and ``comment``. The score is a neutral 0.5 when either text
	        is missing or when the judge call fails; this function does not
	        raise for judge errors.
	    """
	    import json

	    inputs = run.inputs or {}
	    outputs = run.outputs or {}

	    # ``or ""`` keeps missing/None values empty instead of the text "None",
	    # so the insufficient-data guard below actually triggers for them.
	    financial_data = str(inputs.get("financial_data") or "")
	    verdict_text = str(outputs.get("final_verdict") or "")

	    if not financial_data or not verdict_text:
	        return {"key": "company_identity", "score": 0.5, "comment": "Insufficient data"}

	    # Deliberately broad boundary: a missing dependency, network failure or
	    # malformed reply yields a neutral score instead of raising.
	    try:
	        from langchain_openai import ChatOpenAI

	        judge_llm = ChatOpenAI(
	            model=os.getenv("EVAL_MODEL", "nvidia/nemotron-3-nano-30b-a3b:free"),
	            api_key=os.getenv("OPENROUTER_API_KEY"),
	            base_url="https://openrouter.ai/api/v1",
	            temperature=0,
	            max_tokens=256,
	        )

	        judge_prompt = (
	            "You are a fact-checking judge. Compare the FINANCIAL DATA (ground truth) "
	            "with the ANALYST REPORT to check if the analyst correctly identifies "
	            "what the company actually does.\n\n"
	            "Score 0.0 if the analyst describes a completely different business, "
	            "0.5 if partially correct, 1.0 if accurate.\n\n"
	            "Respond with ONLY: {\"score\": <float>, \"reason\": \"<short reason>\"}\n\n"
	            f"FINANCIAL DATA:\n{financial_data[:3000]}\n\n"
	            f"ANALYST REPORT:\n{verdict_text[:3000]}"
	        )

	        response = judge_llm.invoke(judge_prompt)
	        raw = response.content if isinstance(response.content, str) else str(response.content)

	        # Tolerate replies that wrap the JSON object in code fences or prose
	        # by parsing the outermost {...} span when one exists.
	        json_span = re.search(r"\{.*\}", raw, re.DOTALL)
	        try:
	            result = json.loads(json_span.group(0) if json_span else raw)
	            score = float(result.get("score", 0.5))
	            reason = str(result.get("reason", ""))
	        except (ValueError, TypeError, AttributeError):
	            # Not a usable JSON object (JSONDecodeError is a ValueError):
	            # fall back to the first number in the reply.
	            score_match = re.search(r"\d+(?:\.\d+)?", raw)
	            score = float(score_match.group(0)) if score_match else 0.5
	            reason = raw[:200]

	        return {"key": "company_identity", "score": max(0, min(1, score)), "comment": reason}

	    except Exception as exc:
	        return {"key": "company_identity", "score": 0.5, "comment": f"Judge error: {exc}"}


	# ---------------------------------------------------------------------------
	# 2. Format verifiers (exact-match, no LLM)
	# ---------------------------------------------------------------------------

	def format_score(run, example) -> dict:
	    """Check structural correctness of the verdict report.

	    Validates:
	    - every header in ``REQUIRED_HEADERS`` is present
	    - no header is duplicated (the double-header bug)
	    - a Kelly / position-sizing section exists for BUY and STRONG BUY verdicts

	    Args:
	        run: Object exposing an ``outputs`` mapping (may be ``None``).
	        example: Unused; kept for the evaluator call signature.

	    Returns:
	        A dict with ``key`` ("format"), ``score`` (fraction of checks that
	        passed) and ``comment`` listing the failed checks. Empty, missing
	        or rejected verdicts score 1.0 because the checks do not apply.
	    """
	    outputs = run.outputs or {}
	    # ``or ""`` keeps a missing/None verdict empty instead of the text "None".
	    verdict_text = str(outputs.get("final_verdict") or "")
	    upper = verdict_text.upper()

	    if not verdict_text or "REJECTED" in upper:
	        return {"key": "format", "score": 1.0, "comment": "Rejected/empty, N/A"}

	    issues = []
	    total_checks = 0

	    for header in REQUIRED_HEADERS:
	        total_checks += 1
	        count = verdict_text.count(header)
	        if count == 0:
	            issues.append(f"Missing: {header}")
	        elif count > 1:
	            issues.append(f"Duplicated ({count}x): {header}")

	    # Read the verdict as a whole word, preferring the text after the last
	    # FINAL VERDICT header, so "buyback"/"buyers" are not taken for BUY and
	    # an incidental "avoid" elsewhere does not cancel a real BUY.
	    verdict_pattern = r"\b(STRONG\s+BUY|BUY|WATCH|AVOID)\b"
	    final_section = upper.rpartition("### FINAL VERDICT")[2]
	    verdict = re.search(verdict_pattern, final_section) or re.search(verdict_pattern, upper)
	    is_buy = verdict is not None and verdict.group(1).endswith("BUY")

	    if is_buy:
	        total_checks += 1
	        # Case-insensitive so "Position Sizing" / "KELLY" also count.
	        if "POSITION SIZING" not in upper and "KELLY" not in upper:
	            issues.append("Missing Kelly section for BUY verdict")

	    passed = total_checks - len(issues)
	    score = passed / total_checks if total_checks > 0 else 1.0

	    return {
	        "key": "format",
	        "score": score,
	        "comment": "; ".join(issues) if issues else "All format checks passed",
	    }


	def verdict_validity_score(run, example) -> dict:
	    """Check that the report states one of the labels in ``VALID_VERDICTS``.

	    Args:
	        run: Object exposing an ``outputs`` mapping (may be ``None``).
	        example: Unused; kept for the evaluator call signature.

	    Returns:
	        A dict with ``key`` ("verdict_validity"), ``score`` (1.0 when a
	        valid label is found or the verdict is empty/rejected, else 0.0)
	        and ``comment``.
	    """
	    outputs = run.outputs or {}
	    # ``or ""`` keeps a missing/None verdict empty instead of the text "None".
	    verdict_text = str(outputs.get("final_verdict") or "")
	    upper = verdict_text.upper()

	    if not verdict_text or "REJECTED" in upper:
	        return {"key": "verdict_validity", "score": 1.0, "comment": "Rejected, N/A"}

	    # Build a whole-word pattern from VALID_VERDICTS so the constant is the
	    # single source of truth and "BUYBACK"/"WATCHLIST" do not count as
	    # verdicts. Multi-word labels tolerate any run of whitespace.
	    labels = sorted(VALID_VERDICTS, key=len, reverse=True)
	    alternation = "|".join(
	        r"\s+".join(re.escape(word) for word in label.split()) for label in labels
	    )
	    verdict_pattern = re.compile(rf"\b({alternation})\b")

	    # Prefer the text after the last FINAL VERDICT header; fall back to the
	    # whole report when the header or a label there is missing.
	    final_section = upper.rpartition("### FINAL VERDICT")[2]
	    match = verdict_pattern.search(final_section) or verdict_pattern.search(upper)

	    if match:
	        found_verdict = " ".join(match.group(1).split())
	        return {"key": "verdict_validity", "score": 1.0, "comment": f"Valid: {found_verdict}"}

	    return {"key": "verdict_validity", "score": 0.0, "comment": "Invalid/missing verdict"}


	# ---------------------------------------------------------------------------
	# 3. Math verifier
	# ---------------------------------------------------------------------------

	def kelly_math_score(run, example) -> dict:
	    """Verify the reported Kelly allocation is within valid bounds.

	    For BUY / STRONG BUY verdicts, looks for ``allocation: <number>%`` in
	    ``run.outputs["final_verdict"]`` and checks that the number lies
	    between 1 and 25 inclusive.

	    Args:
	        run: Object exposing an ``outputs`` mapping (may be ``None``).
	        example: Unused; kept for the evaluator call signature.

	    Returns:
	        A dict with ``key`` ("kelly_math"), ``score`` and ``comment``:
	        1.0 for non-buy (including empty or rejected) verdicts and for
	        in-range allocations, 0.5 when a buy verdict has no parseable
	        allocation, 0.0 when the allocation is out of range.
	    """
	    outputs = run.outputs or {}
	    # ``or ""`` keeps a missing/None verdict empty instead of the text "None".
	    verdict_text = str(outputs.get("final_verdict") or "")
	    upper = verdict_text.upper()

	    # Read the verdict as a whole word, preferring the text after the last
	    # FINAL VERDICT header, so "buyback"/"buyers" are not taken for BUY and
	    # an incidental "avoid" elsewhere does not cancel a real BUY.
	    verdict_pattern = r"\b(STRONG\s+BUY|BUY|WATCH|AVOID)\b"
	    final_section = upper.rpartition("### FINAL VERDICT")[2]
	    verdict = re.search(verdict_pattern, final_section) or re.search(verdict_pattern, upper)
	    is_buy = verdict is not None and verdict.group(1).endswith("BUY")

	    # Empty and rejected reports are N/A, matching the other format checks.
	    if not verdict_text or "REJECTED" in upper or not is_buy:
	        return {"key": "kelly_math", "score": 1.0, "comment": "Non-buy, N/A"}

	    # Only a well-formed decimal is captured, so float() below cannot raise
	    # on inputs such as "1.2.3" or a lone "."; the label is case-insensitive.
	    match = re.search(
	        r"allocation:\s*(\d+(?:\.\d*)?|\.\d+)\s*%",
	        verdict_text,
	        re.IGNORECASE,
	    )
	    if not match:
	        return {"key": "kelly_math", "score": 0.5, "comment": "No allocation found in BUY verdict"}

	    pct = float(match.group(1))
	    if 1.0 <= pct <= 25.0:
	        return {"key": "kelly_math", "score": 1.0, "comment": f"{pct}% within [1%, 25%]"}

	    return {"key": "kelly_math", "score": 0.0, "comment": f"{pct}% outside valid range [1%, 25%]"}


	# Registry of the evaluators defined above, listed in definition order.
	ALL_EVALUATORS = [
	    # Hallucination catchers (LLM-as-a-Judge)
	    catalyst_grounding_score,
	    company_identity_score,
	    # Format verifiers (exact-match)
	    format_score,
	    verdict_validity_score,
	    # Math verifier
	    kelly_math_score,
	]