bbkdevops's picture
download
raw
8.72 kB
"""
Phase 1B: Distill QA pairs from local Ollama models.
Strategy: ส่ง context/topic → ให้โมเดลสร้าง Q+A → เก็บคู่คุณภาพสูง
Models: ใช้ทุกโมเดลที่มีใน Ollama แล้ว ensemble คำตอบ
"""
import json
import re
import time
from pathlib import Path
import requests
from tqdm import tqdm
# Base URL of the Ollama HTTP server that list_models() and generate() talk to.
OLLAMA_URL = "http://localhost:11434"
# Output directory for distilled data, placed next to this script.
# NOTE: the directory is created as an import-time side effect.
DISTILLED_DIR = Path(__file__).parent / "distilled"
DISTILLED_DIR.mkdir(exist_ok=True)
# ─── Seed Topics ──────────────────────────────────────────────────────────────
# Seed topics used to generate QA pairs, in both Thai and English.
# Thai-language topics; each one is interpolated into the Thai QA prompt.
THAI_TOPICS = [
"ประวัติศาสตร์ไทย", "วิทยาศาสตร์พื้นฐาน", "คณิตศาสตร์", "ภาษาไทย",
"ภูมิศาสตร์ไทย", "สุขภาพและการแพทย์", "เทคโนโลยีและ AI", "เศรษฐกิจไทย",
"วัฒนธรรมไทย", "กฎหมายไทยพื้นฐาน", "ดาราศาสตร์", "ชีววิทยา",
"ฟิสิกส์", "เคมี", "ปรัชญา", "จิตวิทยา",
]
# English-language topics; each one is interpolated into the English QA prompt.
EN_TOPICS = [
"world history", "mathematics", "physics", "chemistry", "biology",
"computer science", "philosophy", "economics", "psychology", "astronomy",
"literature", "geography", "medicine", "law basics", "artificial intelligence",
"logic and reasoning", "ethics", "linguistics",
]
# ─── Ollama Helpers ───────────────────────────────────────────────────────────
def list_models() -> list[str]:
    """Return the names of the models reported by the Ollama server.

    Sends ``GET {OLLAMA_URL}/api/tags`` with a 5-second timeout and collects
    the ``name`` of every entry under the ``models`` key of the JSON reply.

    Returns:
        The model names, or an empty list when anything goes wrong (server
        unreachable, non-JSON reply, unexpected payload shape). This helper
        is best-effort and never raises.
    """
    try:
        response = requests.get(f"{OLLAMA_URL}/api/tags", timeout=5)
        payload = response.json()
        names = []
        for entry in payload.get("models", []):
            names.append(entry["name"])
        return names
    except Exception:
        # Deliberate best-effort: the caller treats "no models" as the error signal.
        return []
def generate(model: str, prompt: str, temperature: float = 0.7) -> str:
    """Ask one Ollama model to complete *prompt* and return its text reply.

    Posts a non-streaming request to ``{OLLAMA_URL}/api/generate`` with a
    120-second timeout, capping the generation at 512 predicted tokens.

    Args:
        model: Name of the model to query.
        prompt: Prompt text sent verbatim.
        temperature: Sampling temperature forwarded in the request options.

    Returns:
        The stripped ``response`` field of the JSON reply, or ``""`` when the
        field is missing or the request fails for any reason (never raises).
    """
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": temperature, "num_predict": 512},
    }
    try:
        reply = requests.post(
            f"{OLLAMA_URL}/api/generate",
            json=payload,
            timeout=120,
        )
        body = reply.json()
        return body.get("response", "").strip()
    except Exception:
        # Deliberate best-effort: an empty string simply yields no QA pairs downstream.
        return ""
# ─── QA Generation Prompts ────────────────────────────────────────────────────
def make_thai_prompt(topic: str) -> str:
    """Build the Thai prompt requesting five QA pairs about *topic* as a JSON array.

    Args:
        topic: Topic text inserted verbatim at the end of the first line.

    Returns:
        The complete multi-line prompt (no trailing newline).
    """
    heading = f"สร้างคำถามและคำตอบ 5 ข้อเกี่ยวกับหัวข้อ: {topic}"
    # Fixed part of the prompt: expected output format followed by the rules.
    fixed_lines = (
        "รูปแบบที่ต้องการ (JSON array):",
        "[",
        '{"question": "คำถาม", "answer": "คำตอบที่ถูกต้องและครบถ้วน"},',
        "...",
        "]",
        "กฎ:",
        "- คำตอบต้องถูกต้อง 100%",
        "- คำตอบต้องกระชับแต่สมบูรณ์",
        "- ห้ามแต่งเรื่อง",
        "- ตอบเป็น JSON เท่านั้น ไม่มีข้อความอื่น",
    )
    return "\n".join((heading, *fixed_lines))
def make_en_prompt(topic: str) -> str:
    """Build the English prompt requesting five QA pairs about *topic* as a JSON array.

    Args:
        topic: Topic text inserted verbatim at the end of the first line.

    Returns:
        The complete multi-line prompt (no trailing newline).
    """
    # Expected output format shown to the model.
    format_section = (
        "Required format (JSON array):\n"
        "[\n"
        '{"question": "...", "answer": "..."},\n'
        "...\n"
        "]\n"
    )
    # Constraints the model is asked to follow.
    rules_section = (
        "Rules:\n"
        "- Answers must be 100% factually correct\n"
        "- Answers must be concise but complete\n"
        "- No hallucination\n"
        "- Return JSON only, no other text"
    )
    return "Generate 5 high-quality Q&A pairs about: " + topic + "\n" + format_section + rules_section
def make_reasoning_prompt(lang: str) -> str:
    """Build the prompt requesting three logic/math problems with worked solutions.

    Args:
        lang: ``"th"`` selects the Thai prompt; any other value selects the
            English prompt.

    Returns:
        The complete multi-line prompt (no trailing newline).
    """
    # These are plain strings, not f-strings, so braces must NOT be doubled:
    # "{{" would reach the model literally and show it an invalid JSON example.
    if lang == "th":
        return """สร้างคำถามเชิงตรรกะ/คณิตศาสตร์ 3 ข้อพร้อมวิธีคิดและคำตอบ
รูปแบบ JSON:
[
{"question": "โจทย์", "reasoning": "วิธีคิดทีละขั้น", "answer": "คำตอบสุดท้าย"},
...
]
ตอบเป็น JSON เท่านั้น"""
    return """Generate 3 logic/math reasoning Q&A pairs with step-by-step solutions.
JSON format:
[
{"question": "problem", "reasoning": "step by step", "answer": "final answer"},
...
]
Return JSON only."""
# ─── Parse & Validate ─────────────────────────────────────────────────────────
def _clean_field(value) -> str:
    """Normalize one QA field to a stripped string; unusable values become ``""``."""
    if isinstance(value, str):
        return value.strip()
    # Models sometimes emit bare numbers (e.g. a math answer); keep them as text.
    # bool is excluded explicitly because it is a subclass of int.
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return str(value)
    return ""


def _valid_pairs(data: list) -> list[dict]:
    """Return the well-formed QA dicts found in *data*, skipping malformed items."""
    pairs = []
    for item in data:
        if not isinstance(item, dict):
            continue
        question = _clean_field(item.get("question"))
        answer = _clean_field(item.get("answer"))
        reasoning = _clean_field(item.get("reasoning"))
        # Same quality gate as before: both texts must be longer than 5 characters.
        if len(question) > 5 and len(answer) > 5:
            pairs.append({"question": question, "answer": answer, "reasoning": reasoning})
    return pairs


def extract_json(text: str) -> list[dict]:
    """Extract validated QA pairs from a model reply containing a JSON array.

    The reply may wrap the array in extra prose. Every ``[`` is tried as the
    start of a JSON value, and the first array yielding at least one valid
    pair wins. Malformed items (non-dict entries, missing or too-short
    fields) are skipped individually instead of discarding the whole batch.

    Args:
        text: Raw model output.

    Returns:
        A list of ``{"question", "answer", "reasoning"}`` dicts with stripped
        string values (``reasoning`` may be ``""``); empty when nothing usable
        is found.
    """
    decoder = json.JSONDecoder()
    start = text.find("[")
    while start != -1:
        try:
            # raw_decode parses one JSON value and ignores whatever follows it,
            # so trailing prose containing brackets no longer breaks parsing.
            data, _ = decoder.raw_decode(text, start)
        except (ValueError, RecursionError):
            data = None
        if isinstance(data, list):
            pairs = _valid_pairs(data)
            if pairs:
                return pairs
        start = text.find("[", start + 1)
    return []
# ─── Multi-Model Ensemble ─────────────────────────────────────────────────────
def ensemble_generate(models: list[str], prompt: str, lang: str, topic: str) -> list[dict]:
    """Have several models answer the same prompt and merge their QA pairs.

    Querying more than one model is meant to increase the diversity and
    quality of the collected pairs.

    Args:
        models: Names of the models to query, in order.
        prompt: Prompt sent unchanged to every model.
        lang: Language tag stored on each pair.
        topic: Topic label stored on each pair.

    Returns:
        All parsed pairs from all models, each tagged with ``lang``,
        ``source`` (``"distill_<model>"``), ``topic`` and an empty ``context``.
    """
    collected: list[dict] = []
    for model_name in models:
        for pair in extract_json(generate(model_name, prompt)):
            # Keyword order fixes the key order of the serialized record.
            pair.update(
                lang=lang,
                source=f"distill_{model_name}",
                topic=topic,
                context="",
            )
            collected.append(pair)
    return collected
# ─── Main Distillation ────────────────────────────────────────────────────────
def _write_pairs(f, pairs: list[dict], counts: dict[str, int]) -> None:
    """Append *pairs* to the open JSONL file *f* and tally them per language in *counts*."""
    for pair in pairs:
        f.write(json.dumps(pair, ensure_ascii=False) + "\n")
        counts[pair["lang"]] = counts.get(pair["lang"], 0) + 1
    # Model calls are slow; flush so completed batches survive an abrupt stop.
    f.flush()


def distill(n_rounds: int = 3, *, n_reasoning_rounds: int = 20, n_reasoning_models: int = 2):
    """Run the full distillation and write the pairs to ``distilled_qa.jsonl``.

    For every Thai and English seed topic, each available model is prompted
    ``n_rounds`` times. Afterwards, reasoning prompts are sent in both
    languages to the first ``n_reasoning_models`` listed models. The output
    file inside ``DISTILLED_DIR`` is opened in write mode, so a previous run's
    file is overwritten.

    Args:
        n_rounds: Number of prompting rounds per topic.
        n_reasoning_rounds: Number of reasoning rounds per language.
        n_reasoning_models: How many models (taken from the front of the
            model list) are used for reasoning prompts.

    Returns:
        None. Prints an error and returns early when no models are found.
    """
    models = list_models()
    if not models:
        print("ERROR: ไม่พบ Ollama models — รัน `ollama serve` ก่อน")
        return
    print(f"Found {len(models)} models: {models}")
    print(f"Starting distillation — {n_rounds} rounds per topic\n")

    # Only per-language totals are reported, so count instead of keeping every pair.
    counts = {"th": 0, "en": 0}
    out_path = DISTILLED_DIR / "distilled_qa.jsonl"
    # (language tag, seed topics, prompt builder, progress-bar label)
    topic_sets = (
        ("th", THAI_TOPICS, make_thai_prompt, "Thai topics"),
        ("en", EN_TOPICS, make_en_prompt, "EN topics"),
    )

    with open(out_path, "w", encoding="utf-8") as f:
        for lang, topics, make_prompt, label in topic_sets:
            for topic in tqdm(topics, desc=label):
                for _ in range(n_rounds):
                    pairs = ensemble_generate(models, make_prompt(topic), lang, topic)
                    _write_pairs(f, pairs, counts)
                    time.sleep(0.1)

        # Reasoning pairs (both languages)
        print("\nGenerating reasoning pairs ...")
        for lang in ["th", "en"]:
            prompt = make_reasoning_prompt(lang)  # same prompt for every round
            for _ in tqdm(range(n_reasoning_rounds), desc=f"Reasoning {lang}"):
                for model in models[:n_reasoning_models]:
                    # Lower temperature than the topic prompts (0.3 vs the default 0.7).
                    raw = generate(model, prompt, temperature=0.3)
                    pairs = extract_json(raw)
                    for p in pairs:
                        p["lang"] = lang
                        p["source"] = f"distill_reasoning_{model}"
                        p["topic"] = "reasoning"
                        p["context"] = ""
                    _write_pairs(f, pairs, counts)

    th = counts["th"]
    en = counts["en"]
    print(f"\nDistilled: {th + en:,} pairs (Thai: {th:,} | EN: {en:,})")
    print(f"Saved → {out_path}")
# Script entry point: run the distillation with three rounds per topic.
if __name__ == "__main__":
    distill(n_rounds=3)

Xet Storage Details

Size:
8.72 kB
·
Xet hash:
7bfb1f455d96110ea940fef97fcf60b0cf5fdbb95da0ef92399f3fc1dd0d074f

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.