#!/usr/bin/env python3
"""Oracle-retrieval QA upper bound.

Instead of running a retriever, the model is handed the sessions named by the
chosen oracle source (by default the gold sessions in ``answer_session_ids``;
see ``--oracle_source`` for the alternatives). Scoring the resulting answers
separates answer-synthesis errors from retrieval errors.
"""

from __future__ import annotations

import argparse
import json
import os
import random
import sys
import time
from pathlib import Path
from typing import Any, Dict, List

# Put the repository root on sys.path so the project-local ``model_zoo`` module
# imported below resolves when this file is executed directly as a script.
REPO_ROOT = Path(__file__).resolve().parents[1]
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

from openai import OpenAI

# Azure support is optional: when either import fails, both AzureOpenAI and
# credential are set to None and the Azure code path is treated as unavailable.
try:
    from openai import AzureOpenAI
    from azure.identity import (
        AzureCliCredential,
        ChainedTokenCredential,
        ManagedIdentityCredential,
        get_bearer_token_provider,
    )

    AZURE_OAUTH_SCOPE = os.environ.get("AZURE_OAUTH_SCOPE", "")
    # A token provider is only built when a scope is configured.
    credential = (
        get_bearer_token_provider(
            ChainedTokenCredential(
                AzureCliCredential(),
                ManagedIdentityCredential(),
            ),
            AZURE_OAUTH_SCOPE,
        )
        if AZURE_OAUTH_SCOPE
        else None
    )
except ImportError:
    AzureOpenAI = None
    credential = None

from model_zoo import model_zoo

# Azure OpenAI endpoint (set AZURE_OPENAI_ENDPOINT env var to your deployment URL).
AZURE_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT", "")

# OpenAI-compatible LiteLLM proxy URL (set LITELLM_BASE_URL env var to your proxy).
TRITONAI_BASE_URL = os.environ.get("LITELLM_BASE_URL", "")  # "" when the env var is unset

# HTTP statuses that llm_call treats as transient and retries. 403 is retried
# alongside throttling/server errors (presumably to ride out expiring
# credentials -- TODO confirm with the original author).
_RETRYABLE_STATUSES = (403, 429, 500, 503, 504)

# Message fragments used to guess a status when the exception carries none.
# Order matters: the first matching entry wins.
_STATUS_MESSAGE_HINTS = (
    (429, ("429", "rate limit")),
    (500, ("500", "internal server error")),
    (503, ("503", "api configuration unavailable")),
    (504, ("504", "gateway time-out", "gateway timeout")),
)

# Extra model calls made when a completion comes back without content.
_EMPTY_CONTENT_RETRIES = 2


def read_json(path: str | Path) -> Any:
    """Load and return the JSON document stored at *path* (read as UTF-8)."""
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def read_existing_qids(path: str | Path) -> set[str]:
    """Return the ``question_id`` values already recorded in a JSONL output file.

    Used to resume an interrupted run. A missing file yields an empty set.
    Blank lines are ignored. Lines that are not valid JSON objects with a
    ``question_id`` (for example a row truncated by a killed process) are
    skipped with a warning instead of aborting the resume; those questions are
    simply answered again.
    """
    path = Path(path)
    if not path.exists():
        return set()
    done: set[str] = set()
    with open(path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                done.add(json.loads(line)["question_id"])
            except (json.JSONDecodeError, KeyError, TypeError) as exc:
                print(
                    f"[WARN] {path}:{line_no}: skipping unreadable row ({exc!r})",
                    flush=True,
                )
    return done


def _lacks_trailing_newline(path: Path) -> bool:
    """Return True when *path* is a non-empty file whose last byte is not a newline."""
    if not path.exists() or path.stat().st_size == 0:
        return False
    with open(path, "rb") as f:
        f.seek(-1, os.SEEK_END)
        return f.read(1) != b"\n"


def retryable_status(exc: Exception) -> int | None:
    """Best-effort extraction of an HTTP status code from an exception.

    Looks, in order, at ``exc.status_code`` / ``exc.http_status``, then at
    ``exc.response.status_code``, and finally falls back to substring matching
    on the lower-cased exception message. Returns ``None`` when nothing
    matches. Despite the name, the returned status is not filtered: the caller
    decides which codes are worth retrying.
    """
    status = getattr(exc, "status_code", None) or getattr(exc, "http_status", None)
    if status is not None:
        return int(status)

    resp = getattr(exc, "response", None)
    if resp is not None and getattr(resp, "status_code", None) is not None:
        return int(resp.status_code)

    msg = str(exc).lower()
    for code, fragments in _STATUS_MESSAGE_HINTS:
        if any(fragment in msg for fragment in fragments):
            return code
    return None


def make_client(args, api_version: str):
    """Build the chat-completions client selected by the CLI flags.

    Flags are checked in priority order ``--nvidia``, ``--tritonai``,
    ``--vllm``, ``--debug``; with none set, an Azure OpenAI client is created.

    Args:
        args: Parsed CLI namespace providing the boolean attributes
            ``nvidia``, ``tritonai``, ``vllm`` and ``debug``.
        api_version: API version passed to the Azure client only.

    Raises:
        RuntimeError: if the selected backend is not usable (proxy URL or
            Azure endpoint not configured, or Azure packages not importable).
    """
    if args.nvidia:
        return OpenAI(
            api_key=os.getenv("NV_API_KEY"),
            base_url="https://inference-api.nvidia.com/v1",
        )
    if args.tritonai:
        # An empty base_url would be passed through as-is and produce a client
        # that cannot reach any server; fail early with an actionable message.
        if not TRITONAI_BASE_URL:
            raise RuntimeError("LITELLM_BASE_URL is not set; it is required for --tritonai.")
        return OpenAI(api_key=os.getenv("TRITONAI_API_KEY"), base_url=TRITONAI_BASE_URL)
    if args.vllm:
        return OpenAI(
            api_key=os.getenv("VLLM_API_KEY", "EMPTY"),
            base_url=os.getenv("VLLM_BASE_URL", "http://localhost:8000/v1"),
        )
    if args.debug:
        return OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    if AzureOpenAI is None:
        raise RuntimeError(
            "AzureOpenAI is not available. Use --nvidia, --tritonai, --vllm, or --debug."
        )
    if not AZURE_ENDPOINT:
        raise RuntimeError(
            "AZURE_OPENAI_ENDPOINT is not set; it is required for the Azure backend."
        )
    return AzureOpenAI(
        azure_endpoint=AZURE_ENDPOINT,
        azure_ad_token_provider=credential,
        api_version=api_version,
    )


def llm_call(client, deployment_name: str, prompt: str, use_user_role: bool, max_retries: int = 999):
    """Send *prompt* as a single chat message and return the raw completion.

    Args:
        client: Object exposing ``chat.completions.create``.
        deployment_name: Value passed as ``model``.
        prompt: Full prompt text, sent as the only message.
        use_user_role: Send the message with role ``"user"`` when true,
            otherwise with role ``"system"``.
        max_retries: Maximum number of attempts.

    Errors whose status (see ``retryable_status``) is in
    ``_RETRYABLE_STATUSES`` are retried after ``min(120, 30 + 5 * attempt)``
    seconds; any other exception propagates immediately.

    Raises:
        RuntimeError: when every attempt failed with a retryable status (or
            ``max_retries`` is not positive); chained to the last error.
    """
    role = "user" if use_user_role else "system"
    last_exc: Exception | None = None
    for attempt in range(max_retries):
        try:
            return client.chat.completions.create(
                model=deployment_name,
                messages=[{"role": role, "content": prompt}],
            )
        except Exception as exc:
            status = retryable_status(exc)
            if status not in _RETRYABLE_STATUSES:
                raise
            last_exc = exc
            if attempt == max_retries - 1:
                break  # out of attempts: no point sleeping before giving up
            wait = min(120, 30 + attempt * 5)
            print(f"[WARN] HTTP {status}; sleeping {wait}s then retrying", flush=True)
            time.sleep(wait)
    raise RuntimeError(f"LLM call failed after {max_retries} attempt(s)") from last_exc


def oracle_session_ids(entry: Dict[str, Any], source: str) -> List[str]:
    """Return the session ids stored on *entry* for the given oracle *source*.

    ``answer`` and ``scenario`` read ``answer_session_ids`` and
    ``scenario_session_ids``; both haystack sources return the full
    ``haystack_session_ids`` list (no truncation or sampling is applied here).
    Missing keys yield an empty list.

    Raises:
        ValueError: for an unrecognised *source*.
    """
    if source == "answer":
        return entry.get("answer_session_ids", [])
    if source == "scenario":
        return entry.get("scenario_session_ids", [])
    if source in ("haystack_truncate", "haystack_subsample"):
        return entry.get("haystack_session_ids", [])
    raise ValueError(f"Unknown oracle source: {source}")


def select_haystack_session_ids(
    entry: Dict[str, Any],
    all_sessions: Dict[str, List[Dict[str, str]]],
    source: str,
    max_haystack_tokens: int,
    subsample_n: int,
) -> List[str]:
    """Apply the truncate/subsample policy and return the session ids to feed in.

    ``haystack_truncate`` keeps sessions in their original order until the
    estimated size would exceed ``max_haystack_tokens``; the first session is
    always kept, even if it alone is over budget. Tokens are estimated as
    chars/4, a cheap heuristic that is good enough for a budget.

    ``haystack_subsample`` draws ``min(subsample_n, len(haystack))`` sessions
    with a RNG seeded by ``entry["question_id"]``, so the draw is reproducible
    per question. The result is in sampled order, not haystack order.

    Raises:
        ValueError: when *source* is not one of the two haystack policies.
    """
    haystack = entry.get("haystack_session_ids", [])

    if source == "haystack_truncate":
        kept: List[str] = []
        char_budget = max_haystack_tokens * 4
        used_chars = 0
        for sid in haystack:
            turns = all_sessions.get(sid, [])
            sess_chars = sum(len(t.get("content") or "") for t in turns)
            # `and kept` guarantees at least one session survives truncation.
            if used_chars + sess_chars > char_budget and kept:
                break
            kept.append(sid)
            used_chars += sess_chars
        return kept

    if source == "haystack_subsample":
        rng = random.Random(entry["question_id"])
        n = min(subsample_n, len(haystack))
        return rng.sample(haystack, n) if n > 0 else []

    raise ValueError(f"select_haystack_session_ids called with non-haystack source: {source}")


def build_session_prompt(
    entry: Dict[str, Any],
    all_sessions: Dict[str, List[Dict[str, str]]],
    source: str,
    selected_session_ids: List[str],
) -> str:
    """Render the QA prompt for *entry* from the selected sessions.

    Each selected session becomes a block with its id, its date (looked up by
    pairing ``haystack_session_ids`` with ``haystack_dates``; empty when
    unknown) and its turns serialised as a JSON list of role/content objects.
    Session ids absent from *all_sessions* render with an empty turn list.
    *entry* must provide ``question`` and ``question_date``; the haystack
    fields are optional.
    """
    # .get keeps entries without haystack metadata usable (dates render empty),
    # matching how the other helpers in this file read optional entry fields.
    date_lookup = dict(
        zip(entry.get("haystack_session_ids", []), entry.get("haystack_dates", []))
    )

    session_blocks = []
    for sid in selected_session_ids:
        turns = all_sessions.get(sid, [])
        session_blocks.append(
            "Session ID: {sid}\nSession Date: {date}\nSession Content:\n{content}".format(
                sid=sid,
                date=date_lookup.get(sid, ""),
                content=json.dumps(
                    [{"role": x.get("role"), "content": x.get("content")} for x in turns],
                    ensure_ascii=False,
                ),
            )
        )

    if session_blocks:
        evidence = "\n\n".join(session_blocks)
    else:
        evidence = "(No sessions are available for this oracle source.)"

    if source == "answer":
        source_desc = "the gold answer-relevant chat history sessions"
    elif source == "scenario":
        source_desc = "all chat history sessions from the question scenario"
    elif source == "haystack_truncate":
        source_desc = "chat history sessions from the user's haystack (truncated to context window)"
    elif source == "haystack_subsample":
        source_desc = "a random subsample of chat history sessions from the user's haystack"
    else:
        source_desc = "chat history sessions"

    # The template is formatted exactly once with evidence as an argument, so
    # braces inside session content are never interpreted as format fields.
    return """I will give you {source_desc} between an assistant and a user. Answer the question using only these sessions. If the provided sessions do not contain enough information to answer, say that the information is not available from the provided chat history.


Chat history sessions:

{evidence}

Current Date: {question_date}
Question: {question}
Answer:""".format(
        source_desc=source_desc,
        evidence=evidence,
        question_date=entry["question_date"],
        question=entry["question"],
    )


def usage_dict(completion) -> Dict[str, int]:
    """Return prompt/completion/total token counts from *completion*.

    A missing ``usage`` attribute, or missing/``None`` individual counters,
    are reported as 0.
    """
    usage = getattr(completion, "usage", None)
    if usage is None:
        return {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
    return {
        "prompt_tokens": getattr(usage, "prompt_tokens", 0) or 0,
        "completion_tokens": getattr(usage, "completion_tokens", 0) or 0,
        "total_tokens": getattr(usage, "total_tokens", 0) or 0,
    }


def _answer_question(
    client,
    deployment_name: str,
    prompt: str,
    use_user_role: bool,
) -> tuple[str, Dict[str, int], Dict[str, int], int]:
    """Ask the model, re-asking up to ``_EMPTY_CONTENT_RETRIES`` times on empty content.

    Returns ``(answer, last_usage, total_usage, n_calls)`` where *answer* is
    the stripped content ("" if every call came back empty), *last_usage* is
    the usage of the final call, and *total_usage* / *n_calls* cover every
    call made, so retries are not hidden from the token accounting.
    """
    total_usage = {"prompt_tokens": 0, "completion_tokens": 0}
    last_usage = usage_dict(None)
    content = None
    n_calls = 0
    for _ in range(1 + _EMPTY_CONTENT_RETRIES):
        completion = llm_call(client, deployment_name, prompt, use_user_role=use_user_role)
        n_calls += 1
        last_usage = usage_dict(completion)
        total_usage["prompt_tokens"] += last_usage["prompt_tokens"]
        total_usage["completion_tokens"] += last_usage["completion_tokens"]
        content = completion.choices[0].message.content if completion.choices else None
        if content is not None:
            break
    return (content or "").strip(), last_usage, total_usage, n_calls


def main() -> None:
    """CLI entry point: answer every not-yet-answered question and append JSONL rows.

    Reads the question entries and the session store, skips question ids that
    are already present in ``--out_file`` (resume support), and for each
    remaining entry selects sessions per ``--oracle_source``, builds the
    prompt, queries the model and appends one result row, flushing after each
    row. A short progress record is also printed to stdout.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--in_file", required=True)
    parser.add_argument("--out_file", required=True)
    parser.add_argument("--model_name", required=True)
    parser.add_argument("--all_sessions_file", default="dataset/all_sessions.json")
    parser.add_argument(
        "--oracle_source",
        choices=["answer", "scenario", "haystack_truncate", "haystack_subsample"],
        default="answer",
        help=("answer = gold answer sessions; scenario = scenario sessions; "
              "haystack_truncate = haystack truncated to --max_haystack_tokens; "
              "haystack_subsample = random --subsample_n sessions from haystack"),
    )
    parser.add_argument(
        "--max_haystack_tokens",
        type=int,
        default=900_000,
        help="Token budget for haystack_truncate (chars/4 estimate); default 900K",
    )
    parser.add_argument(
        "--subsample_n",
        type=int,
        default=20,
        help="N sessions for haystack_subsample (seeded by question_id); default 20",
    )
    parser.add_argument("--limit", type=int, default=None)
    parser.add_argument("--debug", action="store_true", default=False)
    parser.add_argument("--vllm", action="store_true", default=False)
    parser.add_argument("--tritonai", action="store_true", default=False)
    parser.add_argument("--nvidia", action="store_true", default=False)
    args = parser.parse_args()

    deployment_name, api_version = model_zoo[args.model_name]
    client = make_client(args, api_version)
    # Only the NVIDIA and LiteLLM-proxy backends get the prompt as a "user"
    # message; every other backend receives it with the "system" role.
    use_user_role = args.nvidia or args.tritonai

    entries = read_json(args.in_file)
    if args.limit is not None:
        entries = entries[: args.limit]
    all_sessions = read_json(args.all_sessions_file)

    out_path = Path(args.out_file)
    existing = read_existing_qids(out_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # If a previous run died mid-write, terminate its partial row first so the
    # next row starts on a line of its own.
    needs_newline = _lacks_trailing_newline(out_path)

    with open(out_path, "a", encoding="utf-8") as out_f:
        if needs_newline:
            out_f.write("\n")

        for idx, entry in enumerate(entries):
            qid = entry["question_id"]
            if qid in existing:
                continue

            start = time.time()
            if args.oracle_source in ("haystack_truncate", "haystack_subsample"):
                selected_session_ids = select_haystack_session_ids(
                    entry,
                    all_sessions,
                    args.oracle_source,
                    max_haystack_tokens=args.max_haystack_tokens,
                    subsample_n=args.subsample_n,
                )
            else:
                selected_session_ids = oracle_session_ids(entry, args.oracle_source)

            prompt = build_session_prompt(
                entry, all_sessions, args.oracle_source, selected_session_ids
            )
            answer, usage, total_usage, n_calls = _answer_question(
                client, deployment_name, prompt, use_user_role
            )

            row = {
                "q_idx": idx,
                "question_id": qid,
                "hypothesis": answer,
                "oracle_source": args.oracle_source,
                "oracle_session_ids": selected_session_ids,
                "n_oracle_sessions": len(selected_session_ids),
                # Size of the final call only; the budget below covers all calls.
                "n_prompt_tok": usage["prompt_tokens"],
                "n_completion_tok": usage["completion_tokens"],
                "token_budget": {
                    "oracle_answer": {
                        "prompt_tokens": total_usage["prompt_tokens"],
                        "completion_tokens": total_usage["completion_tokens"],
                        "n_calls": n_calls,
                    },
                    "total": {
                        "prompt_tokens": total_usage["prompt_tokens"],
                        "completion_tokens": total_usage["completion_tokens"],
                        "n_calls": n_calls,
                    },
                },
                "wall_time_sec": time.time() - start,
            }
            # Flush per row so an interrupted run loses at most the row in flight.
            print(json.dumps(row, ensure_ascii=False), file=out_f, flush=True)
            print(json.dumps({
                "q_idx": idx,
                "question_id": qid,
                "n_oracle_sessions": row["n_oracle_sessions"],
                "wall_time_sec": round(row["wall_time_sec"], 3),
            }), flush=True)


if __name__ == "__main__":
    main()