Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

Files

xet

bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /data /distill.py

bbkdevops

about 1 month ago

download

raw

8.72 kB

	"""
	Phase 1B: Distill QA pairs from local Ollama models.
	Strategy: send a context/topic → have the model generate Q+A → keep the high-quality pairs.
	Models: use every model available in Ollama, then ensemble the answers.
	"""

	import json
	import re
	import time
	from pathlib import Path

	import requests
	from tqdm import tqdm

	# Base URL of the local Ollama HTTP server.
	OLLAMA_URL = "http://localhost:11434"

	# Output directory that sits next to this script; created on import if missing.
	DISTILLED_DIR = Path(__file__).parent.joinpath("distilled")
	DISTILLED_DIR.mkdir(exist_ok=True)

	# ─── Seed Topics ──────────────────────────────────────────────────────────────
	# Starting topics used to generate QA pairs, in both Thai and English.

	THAI_TOPICS = [
	    "ประวัติศาสตร์ไทย",
	    "วิทยาศาสตร์พื้นฐาน",
	    "คณิตศาสตร์",
	    "ภาษาไทย",
	    "ภูมิศาสตร์ไทย",
	    "สุขภาพและการแพทย์",
	    "เทคโนโลยีและ AI",
	    "เศรษฐกิจไทย",
	    "วัฒนธรรมไทย",
	    "กฎหมายไทยพื้นฐาน",
	    "ดาราศาสตร์",
	    "ชีววิทยา",
	    "ฟิสิกส์",
	    "เคมี",
	    "ปรัชญา",
	    "จิตวิทยา",
	]

	EN_TOPICS = [
	    "world history",
	    "mathematics",
	    "physics",
	    "chemistry",
	    "biology",
	    "computer science",
	    "philosophy",
	    "economics",
	    "psychology",
	    "astronomy",
	    "literature",
	    "geography",
	    "medicine",
	    "law basics",
	    "artificial intelligence",
	    "logic and reasoning",
	    "ethics",
	    "linguistics",
	]


	# ─── Ollama Helpers ───────────────────────────────────────────────────────────

	def list_models() -> list[str]:
	    """Return the names of the models listed by the Ollama server.

	    Sends ``GET {OLLAMA_URL}/api/tags`` with a 5 second timeout and collects
	    the ``name`` field of every entry under the ``models`` key.

	    Returns:
	        The model names, or an empty list when the server cannot be reached,
	        answers with an HTTP error status, or returns a body that is not the
	        expected JSON shape. The lookup is deliberately best-effort: callers
	        treat an empty list as "no models available".
	    """
	    try:
	        resp = requests.get(f"{OLLAMA_URL}/api/tags", timeout=5)
	        resp.raise_for_status()
	        payload = resp.json()
	    except (requests.RequestException, ValueError):
	        # Network failure, HTTP error status, or a non-JSON body.
	        return []

	    entries = payload.get("models") if isinstance(payload, dict) else None
	    if not isinstance(entries, list):
	        return []
	    # Skip malformed entries instead of discarding the whole listing.
	    return [
	        entry["name"]
	        for entry in entries
	        if isinstance(entry, dict) and "name" in entry
	    ]


	def generate(model: str, prompt: str, temperature: float = 0.7) -> str:
	    """Run one non-streaming completion against the Ollama server.

	    Posts to ``{OLLAMA_URL}/api/generate`` with ``stream`` set to False and
	    the options ``temperature`` and ``num_predict=512``; the request times
	    out after 120 seconds.

	    Args:
	        model: Name of the Ollama model to query.
	        prompt: Prompt text sent as-is.
	        temperature: Sampling temperature forwarded in the request options.

	    Returns:
	        The stripped ``response`` field of the reply, or an empty string when
	        the request fails, the server answers with an HTTP error status, or
	        the body carries no textual ``response``. Failures are swallowed on
	        purpose so one bad model does not abort a long run.
	    """
	    body = {
	        "model": model,
	        "prompt": prompt,
	        "stream": False,
	        "options": {"temperature": temperature, "num_predict": 512},
	    }
	    try:
	        resp = requests.post(f"{OLLAMA_URL}/api/generate", json=body, timeout=120)
	        resp.raise_for_status()
	        payload = resp.json()
	    except (requests.RequestException, ValueError):
	        # Network failure, HTTP error status, or a non-JSON body.
	        return ""

	    text = payload.get("response") if isinstance(payload, dict) else None
	    return text.strip() if isinstance(text, str) else ""


	# ─── QA Generation Prompts ────────────────────────────────────────────────────

	def make_thai_prompt(topic: str, n_pairs: int = 5) -> str:
	    """Build the Thai prompt asking a model for QA pairs about ``topic``.

	    The prompt asks for a JSON array of ``{"question", "answer"}`` objects
	    and lists the answer-quality rules. It is assembled from explicit lines
	    so the text sent to the model does not depend on how this source file is
	    indented.

	    Args:
	        topic: Topic inserted into the first line of the prompt.
	        n_pairs: Number of QA pairs requested (defaults to 5).

	    Returns:
	        The complete prompt text, without a trailing newline.
	    """
	    lines = (
	        f"สร้างคำถามและคำตอบ {n_pairs} ข้อเกี่ยวกับหัวข้อ: {topic}",
	        "",
	        "รูปแบบที่ต้องการ (JSON array):",
	        "[",
	        '{"question": "คำถาม", "answer": "คำตอบที่ถูกต้องและครบถ้วน"},',
	        "...",
	        "]",
	        "",
	        "กฎ:",
	        "- คำตอบต้องถูกต้อง 100%",
	        "- คำตอบต้องกระชับแต่สมบูรณ์",
	        "- ห้ามแต่งเรื่อง",
	        "- ตอบเป็น JSON เท่านั้น ไม่มีข้อความอื่น",
	    )
	    return "\n".join(lines)


	def make_en_prompt(topic: str, n_pairs: int = 5) -> str:
	    """Build the English prompt asking a model for QA pairs about ``topic``.

	    The prompt asks for a JSON array of ``{"question", "answer"}`` objects
	    and lists the answer-quality rules. It is assembled from explicit lines
	    so the text sent to the model does not depend on how this source file is
	    indented.

	    Args:
	        topic: Topic inserted into the first line of the prompt.
	        n_pairs: Number of QA pairs requested (defaults to 5).

	    Returns:
	        The complete prompt text, without a trailing newline.
	    """
	    lines = (
	        f"Generate {n_pairs} high-quality Q&A pairs about: {topic}",
	        "",
	        "Required format (JSON array):",
	        "[",
	        '{"question": "...", "answer": "..."},',
	        "...",
	        "]",
	        "",
	        "Rules:",
	        "- Answers must be 100% factually correct",
	        "- Answers must be concise but complete",
	        "- No hallucination",
	        "- Return JSON only, no other text",
	    )
	    return "\n".join(lines)


	def make_reasoning_prompt(lang: str, n_pairs: int = 3) -> str:
	    """Build the prompt asking for logic/math problems with worked solutions.

	    The prompt asks for a JSON array of ``{"question", "reasoning",
	    "answer"}`` objects. Both languages show the example object with single
	    braces: the Thai text used to contain f-string style doubled braces in a
	    plain string, so the model was shown ``{{...}}`` as the format to copy.

	    Args:
	        lang: ``"th"`` selects the Thai prompt; any other value selects the
	            English prompt.
	        n_pairs: Number of problems requested (defaults to 3).

	    Returns:
	        The complete prompt text, without a trailing newline.
	    """
	    if lang == "th":
	        lines = (
	            f"สร้างคำถามเชิงตรรกะ/คณิตศาสตร์ {n_pairs} ข้อพร้อมวิธีคิดและคำตอบ",
	            "",
	            "รูปแบบ JSON:",
	            "[",
	            '{"question": "โจทย์", "reasoning": "วิธีคิดทีละขั้น", "answer": "คำตอบสุดท้าย"},',
	            "...",
	            "]",
	            "ตอบเป็น JSON เท่านั้น",
	        )
	    else:
	        lines = (
	            f"Generate {n_pairs} logic/math reasoning Q&A pairs with step-by-step solutions.",
	            "",
	            "JSON format:",
	            "[",
	            '{"question": "problem", "reasoning": "step by step", "answer": "final answer"},',
	            "...",
	            "]",
	            "Return JSON only.",
	        )
	    return "\n".join(lines)


	# ─── Parse & Validate ─────────────────────────────────────────────────────────

	def extract_json(text: str) -> list[dict]:
	    """Pull the QA pairs out of a model reply that should contain a JSON array.

	    The span from the first ``[`` to the last ``]`` is parsed first. If that
	    span is not valid JSON (for example because prose before the array also
	    contains brackets), every ``[`` is tried as the start of an array and the
	    first one that decodes to a list holding at least one object is used.

	    Each item is validated on its own, so one malformed entry (a non-object,
	    or a non-string question/answer) no longer discards the valid pairs that
	    sit next to it.

	    Args:
	        text: Raw model output.

	    Returns:
	        A list of ``{"question", "answer", "reasoning"}`` dicts with stripped
	        string values. Only items whose question and answer are both longer
	        than 5 characters are kept; ``reasoning`` is ``""`` when absent or
	        not a string. Returns ``[]`` when no usable array is found.
	    """
	    text = text.strip()
	    # Find a JSON array in the text (greedy: first "[" through last "]").
	    match = re.search(r'\[[\s\S]*\]', text)
	    if not match:
	        return []

	    data = None
	    try:
	        data = json.loads(match.group())
	    except ValueError:
	        # The greedy span was not valid JSON; scan for an embedded array.
	        decoder = json.JSONDecoder()
	        for opener in re.finditer(r'\[', text):
	            try:
	                candidate, _ = decoder.raw_decode(text, opener.start())
	            except ValueError:
	                continue
	            if isinstance(candidate, list) and any(
	                isinstance(entry, dict) for entry in candidate
	            ):
	                data = candidate
	                break

	    if not isinstance(data, list):
	        return []

	    valid = []
	    for item in data:
	        if not isinstance(item, dict):
	            continue
	        question = item.get("question")
	        answer = item.get("answer")
	        reasoning = item.get("reasoning")
	        if not isinstance(question, str) or not isinstance(answer, str):
	            continue
	        question = question.strip()
	        answer = answer.strip()
	        reasoning = reasoning.strip() if isinstance(reasoning, str) else ""
	        if len(question) > 5 and len(answer) > 5:
	            valid.append(
	                {"question": question, "answer": answer, "reasoning": reasoning}
	            )
	    return valid


	# ─── Multi-Model Ensemble ─────────────────────────────────────────────────────

	def ensemble_generate(models: list[str], prompt: str, lang: str, topic: str) -> list[dict]:
	    """Send one prompt to several models and pool the parsed QA pairs.

	    Each model in ``models`` is queried in order with the same ``prompt``.
	    The pairs parsed from its reply are tagged with ``lang``, a ``source`` of
	    ``distill_<model>``, ``topic`` and an empty ``context``, then appended to
	    the combined result.
	    """
	    pooled = []
	    for name in models:
	        tagged = extract_json(generate(name, prompt))
	        for pair in tagged:
	            pair.update(
	                lang=lang,
	                source=f"distill_{name}",
	                topic=topic,
	                context="",
	            )
	        pooled += tagged
	    return pooled


	# ─── Main Distillation ────────────────────────────────────────────────────────

	def distill(n_rounds: int = 3, n_reasoning_rounds: int = 20) -> None:
	    """Generate QA pairs from every local Ollama model and save them as JSONL.

	    Runs three stages, writing each pair to
	    ``DISTILLED_DIR / "distilled_qa.jsonl"`` as soon as it is produced:

	    1. Thai topics: ``n_rounds`` ensemble rounds per topic.
	    2. English topics: ``n_rounds`` ensemble rounds per topic.
	    3. Reasoning problems in Thai and English: ``n_reasoning_rounds`` rounds
	       per language, using only the first two listed models at
	       temperature 0.3.

	    The output file is opened in write mode, so an existing file is
	    replaced. If no models are listed, an error is printed and nothing is
	    written.

	    Args:
	        n_rounds: Number of generation rounds per seed topic.
	        n_reasoning_rounds: Number of reasoning rounds per language
	            (defaults to 20).
	    """
	    models = list_models()
	    if not models:
	        print("ERROR: ไม่พบ Ollama models — รัน `ollama serve` ก่อน")
	        return

	    print(f"Found {len(models)} models: {models}")
	    print(f"Starting distillation — {n_rounds} rounds per topic\n")

	    # Pairs are only counted for the summary, so tally them per language
	    # instead of keeping every pair in memory.
	    counts = {"th": 0, "en": 0}
	    out_path = DISTILLED_DIR / "distilled_qa.jsonl"
	    topic_suites = (
	        ("Thai topics", THAI_TOPICS, make_thai_prompt, "th"),
	        ("EN topics", EN_TOPICS, make_en_prompt, "en"),
	    )

	    with open(out_path, "w", encoding="utf-8") as f:

	        def save(pairs: list[dict], lang: str) -> None:
	            """Append ``pairs`` to the output file and update the tally."""
	            f.writelines(
	                json.dumps(pair, ensure_ascii=False) + "\n" for pair in pairs
	            )
	            counts[lang] += len(pairs)

	        # Topic-based pairs: the Thai and English suites share one loop.
	        for desc, topics, make_prompt, lang in topic_suites:
	            for topic in tqdm(topics, desc=desc):
	                for _ in range(n_rounds):
	                    prompt = make_prompt(topic)
	                    save(ensemble_generate(models, prompt, lang, topic), lang)
	                    time.sleep(0.1)

	        # Reasoning pairs (both languages).
	        print("\nGenerating reasoning pairs ...")
	        for lang in ("th", "en"):
	            prompt = make_reasoning_prompt(lang)  # same prompt every round
	            for _ in tqdm(range(n_reasoning_rounds), desc=f"Reasoning {lang}"):
	                for model in models[:2]:  # first two listed models only
	                    pairs = extract_json(generate(model, prompt, temperature=0.3))
	                    for pair in pairs:
	                        pair["lang"] = lang
	                        pair["source"] = f"distill_reasoning_{model}"
	                        pair["topic"] = "reasoning"
	                        pair["context"] = ""
	                    save(pairs, lang)

	    th = counts["th"]
	    en = counts["en"]
	    # Plain "|" separator: "\|" is an invalid escape and printed a backslash.
	    print(f"\nDistilled: {th + en:,} pairs (Thai: {th:,} | EN: {en:,})")
	    print(f"Saved → {out_path}")


	# Script entry point: run the full distillation with three rounds per topic.
	if __name__ == "__main__":
	    distill(n_rounds=3)

Xet Storage Details

Size:: 8.72 kB
Xet hash:: 7bfb1f455d96110ea940fef97fcf60b0cf5fdbb95da0ef92399f3fc1dd0d074f

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.