Upload folder using huggingface_hub

29dd711 verified 3 days ago

9.4 kB

	"""
	DLM-NL2JSON-4B — Evaluation Script (Simplified)

	Evaluates the model on the provided test set using an OpenAI-compatible API endpoint.
	Measures per-category exact match accuracy and average latency.

	Usage:
	    # Against a vLLM / TensorRT-LLM served model
	    python eval_example.py \
	        --data test_data_lite_200.jsonl \
	        --base-url http://your-server:8006/v1 \
	        --model qwen3_4b_6th_norag \
	        --api-key token-abc123 \
	        --disable-thinking

	    # Against the OpenAI API (GPT-4o baseline)
	    export OPENAI_API_KEY="sk-..."
	    python eval_example.py \
	        --data test_data_lite_200.jsonl \
	        --model gpt-4o
	"""

	import json, re, time, argparse, os
	from collections import Counter
	from typing import Dict, Any, List

	# ── Prompts ──────────────────────────────────────────────
	# Import from prompts.py (must be in the same directory)
	from prompts import (
	SYS_CSM_DEFAULT,
	SYS_CREDIT_DEFAULT,
	SYS_GIS_DEFAULT,
	SYS_ALP_DEFAULT,
	SYS_CPI_DEFAULT,
	)

	# ── Category id → (task tag, system prompt) ──────────────
	# Several category ids share one tag/prompt pair, so each pair is written
	# once together with the ids it serves. The resulting dict is keyed by
	# category id in ascending order.
	TASK_MAP = {
	    cat_id: (tag, prompt)
	    for tag, prompt, cat_ids in (
	        ("<TASK_ALP>", SYS_ALP_DEFAULT, (0, 1)),            # ALP-A (pattern), ALP-B (flow)
	        ("<TASK_CSM>", SYS_CSM_DEFAULT, (2,)),              # CSM (consumer spending)
	        ("<TASK_CREDIT>", SYS_CREDIT_DEFAULT, (3, 4, 5)),   # CREDIT-Income / -Spending / -Loan/Default
	        ("<TASK_CPI>", SYS_CPI_DEFAULT, (6,)),              # CPI (business status)
	        ("<TASK_GIS>", SYS_GIS_DEFAULT, (9, 10, 11)),       # GIS-Inflow / -Outflow / -Consumption
	    )
	    for cat_id in cat_ids
	}

	# Display label for each category id, used in the per-sample log lines and
	# in the summary table.
	CAT_NAMES = {
	    0: "ALP-A(ptrn)",
	    1: "ALP-B(flow)",
	    2: "CSM",
	    3: "CREDIT-Income",
	    4: "CREDIT-Spending",
	    5: "CREDIT-Loan",
	    6: "CPI",
	    9: "GIS-Inflow",
	    10: "GIS-Outflow",
	    11: "GIS-Consumption",
	}

	# ── Keys that must match per category (used by compare) ──
	# Categories that share a schema are generated from one key list; every
	# category still receives its own list object.
	REQUIRED_KEYS = {
	    0: ["base_ym", "region_nm", "ptrn", "sex_cd", "age_cd", "category"],
	    1: ["base_ym", "region_nm", "flow_cd", "sex_cd", "age_cd", "category"],
	    2: ["base_ym", "region_nm", "industry_select", "sex_cd", "age_cd", "category"],
	    # CREDIT-* categories (3, 4, 5) share one schema.
	    **{
	        credit_cat: ["base_ym", "region_nm", "job_cd", "perc_cd", "sex_cd", "age_cd", "category"]
	        for credit_cat in (3, 4, 5)
	    },
	    6: ["base_ym", "region_nm", "bzc_cd", "cp_cd", "enp_cd", "category"],
	    # GIS inflow/outflow (9, 10) share one schema.
	    **{
	        gis_cat: ["region_nm", "base_ym", "region_count", "category"]
	        for gis_cat in (9, 10)
	    },
	    11: ["region_nm", "base_ym", "industry_category", "category"],
	}


	# ── Normalization helpers ────────────────────────────────
	def norm_int_list(v):
	    """Coerce a list of number-like values into a sorted list of unique ints.

	    Anything that is not a list is returned unchanged. Each element is
	    stringified, stripped, parsed as a float and truncated to ``int``;
	    elements for which that conversion fails are dropped.
	    """
	    if isinstance(v, list):
	        unique = set()
	        for item in v:
	            try:
	                number = int(float(str(item).strip()))
	            except Exception:
	                # Unconvertible entries are ignored rather than failing the sample.
	                continue
	            unique.add(number)
	        return sorted(unique)
	    return v


	def norm_dict_of_lists(d):
	    """Normalize a mapping such as industry_select or bzc_cd.

	    Keys are stringified, and a key that is a single alphabetic character is
	    upper-cased. List values go through ``norm_int_list``; every other value
	    is kept as-is. A non-dict argument is returned unchanged.
	    """
	    if not isinstance(d, dict):
	        return d

	    normalized = {}
	    for raw_key, value in d.items():
	        key = str(raw_key)
	        if len(key) == 1 and key.isalpha():
	            key = key.upper()
	        if isinstance(value, list):
	            value = norm_int_list(value)
	        normalized[key] = value
	    return normalized


	def normalize(obj: Dict[str, Any], cat: int) -> Dict[str, Any]:
	    """Return a normalized copy of a prediction/gold dict for fair comparison.

	    The input is not modified; the steps below are applied to a shallow copy:

	    * ``summary`` is removed (it is excluded from the comparison).
	    * ``base_ym``, ``region_count`` and ``category`` given as strings are
	      converted to ``int`` when they parse as one.
	    * Code-list fields are passed through ``norm_int_list`` (``flow_cd`` only
	      when it is a list).
	    * ``industry_select`` and ``bzc_cd`` are passed through
	      ``norm_dict_of_lists``.
	    * ``region_count`` is clamped to 1..10 when it converts to ``int``;
	      otherwise it is left untouched.

	    Args:
	        obj: Parsed JSON object (prediction or gold).
	        cat: Category id. Not used by the normalization itself; kept so the
	            signature stays stable for callers.

	    Returns:
	        A new dict with the normalizations applied.
	    """
	    o = dict(obj)
	    o.pop("summary", None)

	    for k in ("base_ym", "region_count", "category"):
	        if isinstance(o.get(k), str):
	            try:
	                o[k] = int(o[k])
	            except ValueError:
	                # Not an integer literal: keep the raw string for comparison.
	                pass

	    for k in ("sex_cd", "age_cd", "job_cd", "perc_cd", "ptrn",
	              "industry_category", "cp_cd", "enp_cd"):
	        if k in o:
	            o[k] = norm_int_list(o[k])

	    if isinstance(o.get("flow_cd"), list):
	        o["flow_cd"] = norm_int_list(o["flow_cd"])

	    for k in ("industry_select", "bzc_cd"):
	        if k in o:
	            o[k] = norm_dict_of_lists(o[k])

	    if "region_count" in o:
	        try:
	            o["region_count"] = max(1, min(10, int(o["region_count"])))
	        except (ValueError, TypeError, OverflowError):
	            # int() raises OverflowError for +/-inf, which json.loads produces
	            # for "Infinity"; leave such values as-is instead of aborting.
	            pass

	    return o


	def extract_first_json(text: str):
	    """Return the first balanced ``{...}`` span in *text*, or ``None``.

	    Scanning starts at the first ``{`` and tracks brace depth. Braces that
	    appear inside double-quoted JSON strings (including after escaped
	    quotes) are ignored, so a value such as ``"a } b"`` does not end the
	    object early.

	    Args:
	        text: Raw model output that may contain text around a JSON object.

	    Returns:
	        The substring from the first ``{`` to its matching ``}``, or ``None``
	        when there is no ``{`` or the braces never balance. The substring is
	        not validated as JSON.
	    """
	    start = text.find("{")
	    if start == -1:
	        return None

	    depth = 0
	    in_string = False
	    escaped = False
	    for i in range(start, len(text)):
	        ch = text[i]
	        if in_string:
	            if escaped:
	                # Character following a backslash is consumed verbatim.
	                escaped = False
	            elif ch == "\\":
	                escaped = True
	            elif ch == '"':
	                in_string = False
	            continue
	        if ch == '"':
	            in_string = True
	        elif ch == "{":
	            depth += 1
	        elif ch == "}":
	            depth -= 1
	            if depth == 0:
	                return text[start:i + 1]
	    return None


	def compare(pred: Dict, gold: Dict, cat: int):
	    """Check a prediction against gold on the category's required keys.

	    Only the keys listed in ``REQUIRED_KEYS`` for ``cat`` are inspected (no
	    keys when the category is not listed). A key absent from one side only
	    counts as a mismatch; a key absent from both sides does not.

	    Returns:
	        ``(ok, diff)`` where ``ok`` is True when nothing differs and ``diff``
	        maps each differing key to ``{"pred": ..., "gold": ...}`` (``None``
	        for an absent value).
	    """
	    absent = "<MISSING>"
	    mismatches = {
	        key: {"pred": pred.get(key), "gold": gold.get(key)}
	        for key in REQUIRED_KEYS.get(cat, [])
	        if pred.get(key, absent) != gold.get(key, absent)
	    }
	    return not mismatches, mismatches


	# ── Main ─────────────────────────────────────────────────
	def main():
	    """Run the evaluation end to end and print per-category results.

	    Parses the command-line arguments, loads the JSONL test set, sends every
	    sample to an OpenAI-compatible chat-completions endpoint, compares the
	    first JSON object found in the reply with the gold output, and prints one
	    status line per sample followed by a per-category and overall summary of
	    exact-match accuracy and average latency.

	    A sample counts as incorrect when the request raises, when the reply
	    contains no parseable JSON, or when the parsed JSON is not an object.
	    The latency of failed requests is included in the averages. Test records
	    whose gold category is missing or not present in ``TASK_MAP`` are skipped
	    with a warning.
	    """
	    ap = argparse.ArgumentParser(description="DLM-NL2JSON-4B Evaluation")
	    ap.add_argument("--data", required=True, help="Test JSONL file path")
	    ap.add_argument("--base-url", default=None, help="OpenAI-compatible base URL")
	    ap.add_argument("--model", required=True, help="Model name")
	    ap.add_argument("--api-key", default=os.environ.get("OPENAI_API_KEY", ""), help="API key")
	    ap.add_argument("--disable-thinking", action="store_true",
	                    help="Pass chat_template_kwargs to disable Qwen3 thinking mode")
	    ap.add_argument("--max-tokens", type=int, default=512)
	    ap.add_argument("--per-cat", type=int, default=999, help="Max samples per category")
	    args = ap.parse_args()

	    # Imported here rather than at module level; argument parsing (including
	    # --help) has already completed by the time the package is needed.
	    import openai
	    client = openai.OpenAI(
	        base_url=args.base_url or None,
	        api_key=args.api_key or "dummy",
	        timeout=60.0,
	    )

	    # Load test data. Blank lines (e.g. a trailing newline at EOF) are skipped
	    # instead of being handed to json.loads, which would raise on them.
	    with open(args.data, encoding="utf-8") as f:
	        raw = [json.loads(line) for line in f if line.strip()]

	    # Group by category and sample
	    from collections import defaultdict
	    by_cat = defaultdict(list)
	    skipped = 0
	    for item in raw:
	        out = item["output"] if isinstance(item["output"], dict) else json.loads(item["output"])
	        # The gold category may be stored as a string; coerce it so that it
	        # indexes TASK_MAP and sorts together with integer categories.
	        try:
	            cat = int(out["category"])
	        except (KeyError, TypeError, ValueError):
	            cat = None
	        if cat not in TASK_MAP:
	            # No tag/prompt is defined for this record, so it cannot be run.
	            skipped += 1
	            continue
	        by_cat[cat].append({"input": item["input"], "gold": out})

	    if skipped:
	        print(f"[WARN] Skipped {skipped} samples with a missing or unsupported category")

	    samples = []
	    for cat in sorted(by_cat):
	        items = by_cat[cat][:args.per_cat]
	        samples.extend([(cat, ex) for ex in items])

	    print(f"[INFO] Evaluating {len(samples)} samples across {len(by_cat)} categories\n")

	    # Evaluate
	    ok_counts, total_counts = Counter(), Counter()
	    latency_sums = Counter()

	    for idx, (cat, ex) in enumerate(samples, 1):
	        user_in = ex["input"].strip()
	        gold_norm = normalize(ex["gold"], cat)
	        cat_name = str(CAT_NAMES.get(cat, cat))

	        tag, sys_prompt = TASK_MAP[cat]
	        messages = [
	            {"role": "system", "content": sys_prompt},
	            {"role": "user", "content": f"{tag}\n{user_in}"},
	        ]

	        kwargs = dict(model=args.model, messages=messages,
	                      max_tokens=args.max_tokens, temperature=0.0)
	        if args.disable_thinking:
	            kwargs["extra_body"] = {"chat_template_kwargs": {"enable_thinking": False}}

	        error = None
	        gen = ""
	        t0 = time.perf_counter()
	        try:
	            resp = client.chat.completions.create(**kwargs)
	            # message.content is optional in the OpenAI SDK; treat a missing
	            # body as empty text so it is reported as PARSE_FAIL below.
	            gen = resp.choices[0].message.content or ""
	        except Exception as e:
	            # Per-sample boundary: report the failure and keep evaluating.
	            error = e
	        dt = time.perf_counter() - t0

	        # Failed requests still count toward the totals and the latency.
	        total_counts[cat] += 1
	        latency_sums[cat] += dt

	        if error is not None:
	            print(f"[{idx:04d}] {cat_name} | ERROR: {error}")
	            continue

	        json_str = extract_first_json(gen) or gen.strip()
	        try:
	            pred_obj = json.loads(json_str)
	        except json.JSONDecodeError:
	            pred_obj = None
	        # Valid JSON that is not an object (number, list, null, ...) cannot be
	        # normalized or compared, so it is treated like unparseable output.
	        if not isinstance(pred_obj, dict):
	            print(f"[{idx:04d}] {cat_name} | PARSE_FAIL | {dt:.2f}s")
	            continue

	        pred_norm = normalize(pred_obj, cat)
	        ok, diff = compare(pred_norm, gold_norm, cat)
	        if ok:
	            ok_counts[cat] += 1

	        status = "OK" if ok else f"FAIL {list(diff.keys())}"
	        print(f"[{idx:04d}] {cat_name} | {status} | {dt:.2f}s")

	    # Summary
	    print("\n" + "=" * 50)
	    print("EVALUATION SUMMARY")
	    print("=" * 50)
	    total_ok = total_all = 0
	    for c in sorted(total_counts):
	        ok = ok_counts[c]
	        tot = total_counts[c]
	        acc = ok / tot if tot else 0
	        avg_lat = latency_sums[c] / tot if tot else 0
	        total_ok += ok
	        total_all += tot
	        # str() keeps the ":20s" format valid if the label falls back to the id.
	        print(f" {str(CAT_NAMES.get(c, c)):20s}: {ok:4d}/{tot:4d} acc={acc:.1%} avg={avg_lat:.3f}s")

	    overall_acc = total_ok / total_all if total_all else 0
	    overall_lat = sum(latency_sums.values()) / total_all if total_all else 0
	    print(f" {'OVERALL':20s}: {total_ok:4d}/{total_all:4d} acc={overall_acc:.1%} avg={overall_lat:.3f}s")


	# Run the evaluation only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()