Buckets:
| """ | |
| Phase 1B: Distill QA pairs from local Ollama models. | |
| Strategy: ส่ง context/topic → ให้โมเดลสร้าง Q+A → เก็บคู่คุณภาพสูง | |
| Models: ใช้ทุกโมเดลที่มีใน Ollama แล้ว ensemble คำตอบ | |
| """ | |
| import json | |
| import re | |
| import time | |
| from pathlib import Path | |
| import requests | |
| from tqdm import tqdm | |
# Base URL of the local Ollama HTTP API.
OLLAMA_URL = "http://localhost:11434"

# Output directory for distilled data; created next to this script if missing.
DISTILLED_DIR = Path(__file__).parent.joinpath("distilled")
DISTILLED_DIR.mkdir(exist_ok=True)
| # ─── Seed Topics ────────────────────────────────────────────────────────────── | |
# Seed topics used to generate QA pairs, in both Thai and English.
THAI_TOPICS = [
    "ประวัติศาสตร์ไทย",
    "วิทยาศาสตร์พื้นฐาน",
    "คณิตศาสตร์",
    "ภาษาไทย",
    "ภูมิศาสตร์ไทย",
    "สุขภาพและการแพทย์",
    "เทคโนโลยีและ AI",
    "เศรษฐกิจไทย",
    "วัฒนธรรมไทย",
    "กฎหมายไทยพื้นฐาน",
    "ดาราศาสตร์",
    "ชีววิทยา",
    "ฟิสิกส์",
    "เคมี",
    "ปรัชญา",
    "จิตวิทยา",
]

EN_TOPICS = [
    "world history",
    "mathematics",
    "physics",
    "chemistry",
    "biology",
    "computer science",
    "philosophy",
    "economics",
    "psychology",
    "astronomy",
    "literature",
    "geography",
    "medicine",
    "law basics",
    "artificial intelligence",
    "logic and reasoning",
    "ethics",
    "linguistics",
]
| # ─── Ollama Helpers ─────────────────────────────────────────────────────────── | |
def list_models() -> list[str]:
    """Return the names of the models reported by the local Ollama server.

    Queries ``GET {OLLAMA_URL}/api/tags``. This is a best-effort call: if the
    server is unreachable, answers with an HTTP error status, or returns a body
    that is not JSON, an empty list is returned instead of raising.

    Returns:
        Model names in the order the server lists them; ``[]`` on any failure.
    """
    try:
        response = requests.get(f"{OLLAMA_URL}/api/tags", timeout=5)
        response.raise_for_status()
        payload = response.json()
    except (requests.RequestException, ValueError):
        # ValueError covers JSON decoding errors on older `requests` releases.
        return []

    models = payload.get("models") if isinstance(payload, dict) else None
    # `models` may be missing or null; entries without a usable name are skipped.
    return [
        entry["name"]
        for entry in models or []
        if isinstance(entry, dict) and entry.get("name")
    ]
def generate(model: str, prompt: str, temperature: float = 0.7) -> str:
    """Run one non-streaming completion against the local Ollama server.

    Posts to ``{OLLAMA_URL}/api/generate`` with ``num_predict`` capped at 512
    and a 120-second timeout. This is a best-effort call: network failures,
    HTTP error statuses and malformed bodies all yield an empty string.

    Args:
        model: Name of the Ollama model to query.
        prompt: Prompt text sent to the model.
        temperature: Sampling temperature forwarded in ``options``.

    Returns:
        The stripped ``response`` text, or ``""`` when nothing usable came back.
    """
    request_body = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": temperature, "num_predict": 512},
    }
    try:
        response = requests.post(
            f"{OLLAMA_URL}/api/generate",
            json=request_body,
            timeout=120,
        )
        response.raise_for_status()
        payload = response.json()
    except (requests.RequestException, ValueError):
        # ValueError covers JSON decoding errors on older `requests` releases.
        return ""

    text = payload.get("response") if isinstance(payload, dict) else None
    # Guard against a missing, null or non-string `response` field.
    return text.strip() if isinstance(text, str) else ""
| # ─── QA Generation Prompts ──────────────────────────────────────────────────── | |
def make_thai_prompt(topic: str) -> str:
    """Build the Thai prompt requesting five Q&A pairs about *topic* as a JSON array."""
    prompt_lines = (
        f"สร้างคำถามและคำตอบ 5 ข้อเกี่ยวกับหัวข้อ: {topic}",
        "รูปแบบที่ต้องการ (JSON array):",
        "[",
        '{"question": "คำถาม", "answer": "คำตอบที่ถูกต้องและครบถ้วน"},',
        "...",
        "]",
        "กฎ:",
        "- คำตอบต้องถูกต้อง 100%",
        "- คำตอบต้องกระชับแต่สมบูรณ์",
        "- ห้ามแต่งเรื่อง",
        "- ตอบเป็น JSON เท่านั้น ไม่มีข้อความอื่น",
    )
    return "\n".join(prompt_lines)
def make_en_prompt(topic: str) -> str:
    """Build the English prompt requesting five Q&A pairs about *topic* as a JSON array."""
    prompt_lines = (
        f"Generate 5 high-quality Q&A pairs about: {topic}",
        "Required format (JSON array):",
        "[",
        '{"question": "...", "answer": "..."},',
        "...",
        "]",
        "Rules:",
        "- Answers must be 100% factually correct",
        "- Answers must be concise but complete",
        "- No hallucination",
        "- Return JSON only, no other text",
    )
    return "\n".join(prompt_lines)
def make_reasoning_prompt(lang: str) -> str:
    """Build the prompt requesting three logic/math problems with worked solutions.

    Args:
        lang: ``"th"`` selects the Thai prompt; any other value selects English.

    Returns:
        Prompt text that shows the model the expected JSON array layout with
        ``question``, ``reasoning`` and ``answer`` keys.
    """
    # These are plain (non-f) strings, so braces are written singly. Doubling
    # them would show the model `{{...}}`, which is not valid JSON.
    if lang == "th":
        return """สร้างคำถามเชิงตรรกะ/คณิตศาสตร์ 3 ข้อพร้อมวิธีคิดและคำตอบ
รูปแบบ JSON:
[
{"question": "โจทย์", "reasoning": "วิธีคิดทีละขั้น", "answer": "คำตอบสุดท้าย"},
...
]
ตอบเป็น JSON เท่านั้น"""
    return """Generate 3 logic/math reasoning Q&A pairs with step-by-step solutions.
JSON format:
[
{"question": "problem", "reasoning": "step by step", "answer": "final answer"},
...
]
Return JSON only."""
| # ─── Parse & Validate ───────────────────────────────────────────────────────── | |
def _as_text(value) -> str:
    """Return *value* as stripped text; unsupported types become ``""``."""
    if isinstance(value, str):
        return value.strip()
    # Models often emit bare numbers for math answers; keep them as text
    # instead of letting one such field invalidate the whole batch.
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return str(value)
    return ""


def extract_json(text: str) -> list[dict]:
    """Extract validated question/answer pairs from raw model output.

    Scans *text* for the first JSON array that contains at least one usable
    pair. Each candidate ``[`` is decoded independently, so prose with stray
    brackets before or after the array does not break parsing. Items are
    validated one by one: a malformed item is skipped rather than causing the
    valid ones to be discarded.

    Args:
        text: Raw completion text that may contain a JSON array.

    Returns:
        A list of dicts with ``question``, ``answer`` and ``reasoning`` keys
        (``reasoning`` is ``""`` when absent). Only items whose question and
        answer are both longer than five characters are kept. Returns ``[]``
        when nothing usable is found.
    """
    decoder = json.JSONDecoder()
    start = text.find("[")
    while start != -1:
        try:
            data, _ = decoder.raw_decode(text, start)
        except (ValueError, RecursionError):
            data = None

        if isinstance(data, list):
            valid = []
            for item in data:
                if not isinstance(item, dict):
                    continue
                question = _as_text(item.get("question"))
                answer = _as_text(item.get("answer"))
                if len(question) > 5 and len(answer) > 5:
                    valid.append({
                        "question": question,
                        "answer": answer,
                        "reasoning": _as_text(item.get("reasoning")),
                    })
            if valid:
                return valid

        # Not a usable array at this position; try the next opening bracket.
        start = text.find("[", start + 1)
    return []
| # ─── Multi-Model Ensemble ───────────────────────────────────────────────────── | |
def ensemble_generate(models: list[str], prompt: str, lang: str, topic: str) -> list[dict]:
    """Send *prompt* to every model and pool the parsed QA pairs.

    Each pair is tagged with its language, the producing model (as
    ``distill_<model>``), the topic, and an empty ``context`` field.

    Args:
        models: Names of the Ollama models to query, in order.
        prompt: Prompt text sent unchanged to every model.
        lang: Language code stored on each pair.
        topic: Topic label stored on each pair.

    Returns:
        All tagged pairs from all models, in model order.
    """
    collected: list[dict] = []
    for model_name in models:
        parsed = extract_json(generate(model_name, prompt))
        source_tag = f"distill_{model_name}"
        for pair in parsed:
            pair["lang"] = lang
            pair["source"] = source_tag
            pair["topic"] = topic
            pair["context"] = ""
        collected += parsed
    return collected
| # ─── Main Distillation ──────────────────────────────────────────────────────── | |
def _write_pairs(handle, pairs: list[dict]) -> int:
    """Write *pairs* to *handle* as JSON Lines and return how many were written."""
    handle.writelines(json.dumps(pair, ensure_ascii=False) + "\n" for pair in pairs)
    return len(pairs)


def distill(n_rounds: int = 3, reasoning_rounds: int = 20):
    """Distill QA pairs from every local Ollama model into a JSONL file.

    Runs three passes and writes each pair as one JSON line to
    ``DISTILLED_DIR / "distilled_qa.jsonl"``, overwriting any previous file:

    1. Thai topic prompts, sent to every model.
    2. English topic prompts, sent to every model.
    3. Reasoning prompts in both languages, sent to the first two listed
       models at temperature 0.3.

    Prints a summary of how many pairs were written per language. If no models
    are available, prints an error and returns without creating the file.

    Args:
        n_rounds: Number of times each topic prompt is sent to the ensemble.
        reasoning_rounds: Number of times each reasoning prompt is sent per
            language.
    """
    models = list_models()
    if not models:
        print("ERROR: ไม่พบ Ollama models — รัน `ollama serve` ก่อน")
        return

    print(f"Found {len(models)} models: {models}")
    print(f"Starting distillation — {n_rounds} rounds per topic\n")

    # Only per-language totals are reported, so count as we go rather than
    # keeping every generated pair in memory.
    counts = {"th": 0, "en": 0}
    out_path = DISTILLED_DIR / "distilled_qa.jsonl"
    topic_passes = (
        ("th", THAI_TOPICS, make_thai_prompt, "Thai topics"),
        ("en", EN_TOPICS, make_en_prompt, "EN topics"),
    )

    with open(out_path, "w", encoding="utf-8") as f:
        # Topic passes: Thai first, then English.
        for lang, topics, make_prompt, label in topic_passes:
            for topic in tqdm(topics, desc=label):
                prompt = make_prompt(topic)  # identical for every round
                for _ in range(n_rounds):
                    pairs = ensemble_generate(models, prompt, lang, topic)
                    counts[lang] += _write_pairs(f, pairs)
                    time.sleep(0.1)

        # Reasoning pairs (both languages).
        print("\nGenerating reasoning pairs ...")
        for lang in ("th", "en"):
            prompt = make_reasoning_prompt(lang)  # identical for every round
            for _ in tqdm(range(reasoning_rounds), desc=f"Reasoning {lang}"):
                for model in models[:2]:  # first two listed models only
                    # Lower temperature than the topic passes.
                    raw = generate(model, prompt, temperature=0.3)
                    pairs = extract_json(raw)
                    for p in pairs:
                        p["lang"] = lang
                        p["source"] = f"distill_reasoning_{model}"
                        p["topic"] = "reasoning"
                        p["context"] = ""
                    counts[lang] += _write_pairs(f, pairs)

    th, en = counts["th"], counts["en"]
    total = th + en
    print(f"\nDistilled: {total:,} pairs (Thai: {th:,} | EN: {en:,})")
    print(f"Saved → {out_path}")
# Script entry point: run the full distillation with three rounds per topic.
if __name__ == "__main__":
    distill(n_rounds=3)
Xet Storage Details
- Size:
- 8.72 kB
- Xet hash:
- 7bfb1f455d96110ea940fef97fcf60b0cf5fdbb95da0ef92399f3fc1dd0d074f
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.